Modernize additional V256 code from review

1 year ago · 6238f00895
8 changed files with 175 additions and 176 deletions
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@ -619,29 +619,6 @@ internal static partial class SimdUtils
            return va + (vm0 * vm1);
        }

-        /// <summary>
-        /// Performs a multiplication and a subtraction of the <see cref="Vector256{Single}"/>.
-        /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
-        /// </summary>
-        /// <remarks>ret = (vm0 * vm1) - vs</remarks>
-        /// <param name="vs">The vector to subtract from the intermediate result.</param>
-        /// <param name="vm0">The first vector to multiply.</param>
-        /// <param name="vm1">The second vector to multiply.</param>
-        /// <returns>The <see cref="Vector256{T}"/>.</returns>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static Vector256<float> MultiplySubtract(
-            Vector256<float> vs,
-            Vector256<float> vm0,
-            Vector256<float> vm1)
-        {
-            if (Fma.IsSupported)
-            {
-                return Fma.MultiplySubtract(vm1, vm0, vs);
-            }
-
-            return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
-        }
-
        /// <summary>
        /// Performs a multiplication and a negated addition of the <see cref="Vector256{Single}"/>.
        /// </summary>
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@ -140,6 +140,28 @@ internal static class Vector256_
        return va + (vm0 * vm1);
    }

+    /// <summary>
+    /// Multiplies two <see cref="Vector256{Single}"/> values and subtracts a third from the product.
+    /// </summary>
+    /// <remarks>ret = (vm0 * vm1) - vs. The subtrahend <paramref name="vs"/> is the first parameter here, but the last argument of the FMA intrinsic.</remarks>
+    /// <param name="vs">The vector to subtract from the intermediate result.</param>
+    /// <param name="vm0">The first vector to multiply.</param>
+    /// <param name="vm1">The second vector to multiply.</param>
+    /// <returns>A <see cref="Vector256{Single}"/> holding <c>(vm0 * vm1) - vs</c>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<float> MultiplySubtract(
+        Vector256<float> vs,
+        Vector256<float> vm0,
+        Vector256<float> vm1)
+    {
+        if (Fma.IsSupported) // Prefer the x86 FMA instruction when the hardware exposes it.
+        {
+            return Fma.MultiplySubtract(vm1, vm0, vs); // Intrinsic computes (a * b) - c, so vs goes last.
+        }
+
+        return (vm0 * vm1) - vs; // Fallback: separate multiply and subtract via the Vector256 operators.
+    }
+
    /// <summary>
    /// Packs signed 32-bit integers to signed 16-bit integers and saturates.
    /// </summary>
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@ -211,10 +211,10 @@ internal partial struct Block8x8
    }

    /// <summary>
-    /// Transpose the block inplace.
+    /// Transpose the block in place.
    /// </summary>
    [MethodImpl(InliningOptions.ShortMethod)]
-    public void TransposeInplace()
+    public void TransposeInPlace()
    {
        ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);

--- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
@ -1,142 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Six Labors Split License.
-
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-
-namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
-
-internal static partial class FloatingPointDCT
-{
-    /// <summary>
-    /// Apply floating point FDCT inplace using simd operations.
-    /// </summary>
-    /// <param name="block">Input block.</param>
-    private static void FDCT8x8_Avx(ref Block8x8F block)
-    {
-        DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
-
-        // First pass - process columns
-        FDCT8x8_1D_Avx(ref block);
-
-        // Second pass - process rows
-        block.TransposeInPlace();
-        FDCT8x8_1D_Avx(ref block);
-
-        // Applies 1D floating point FDCT inplace
-        static void FDCT8x8_1D_Avx(ref Block8x8F block)
-        {
-            Vector256<float> tmp0 = Avx.Add(block.V256_0, block.V256_7);
-            Vector256<float> tmp7 = Avx.Subtract(block.V256_0, block.V256_7);
-            Vector256<float> tmp1 = Avx.Add(block.V256_1, block.V256_6);
-            Vector256<float> tmp6 = Avx.Subtract(block.V256_1, block.V256_6);
-            Vector256<float> tmp2 = Avx.Add(block.V256_2, block.V256_5);
-            Vector256<float> tmp5 = Avx.Subtract(block.V256_2, block.V256_5);
-            Vector256<float> tmp3 = Avx.Add(block.V256_3, block.V256_4);
-            Vector256<float> tmp4 = Avx.Subtract(block.V256_3, block.V256_4);
-
-            // Even part
-            Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
-            Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
-            Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
-            Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
-
-            block.V256_0 = Avx.Add(tmp10, tmp11);
-            block.V256_4 = Avx.Subtract(tmp10, tmp11);
-
-            var mm256_F_0_7071 = Vector256.Create(0.707106781f);
-            Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
-            block.V256_2 = Avx.Add(tmp13, z1);
-            block.V256_6 = Avx.Subtract(tmp13, z1);
-
-            // Odd part
-            tmp10 = Avx.Add(tmp4, tmp5);
-            tmp11 = Avx.Add(tmp5, tmp6);
-            tmp12 = Avx.Add(tmp6, tmp7);
-
-            Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), Vector256.Create(0.382683433f));         // mm256_F_0_3826
-            Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10);    // mm256_F_0_5411
-            Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12);    // mm256_F_1_3065
-            Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
-
-            Vector256<float> z11 = Avx.Add(tmp7, z3);
-            Vector256<float> z13 = Avx.Subtract(tmp7, z3);
-
-            block.V256_5 = Avx.Add(z13, z2);
-            block.V256_3 = Avx.Subtract(z13, z2);
-            block.V256_1 = Avx.Add(z11, z4);
-            block.V256_7 = Avx.Subtract(z11, z4);
-        }
-    }
-
-    /// <summary>
-    /// Apply floating point IDCT inplace using simd operations.
-    /// </summary>
-    /// <param name="transposedBlock">Transposed input block.</param>
-    private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
-    {
-        DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
-
-        // First pass - process columns
-        IDCT8x8_1D_Avx(ref transposedBlock);
-
-        // Second pass - process rows
-        transposedBlock.TransposeInPlace();
-        IDCT8x8_1D_Avx(ref transposedBlock);
-
-        // Applies 1D floating point FDCT inplace
-        static void IDCT8x8_1D_Avx(ref Block8x8F block)
-        {
-            // Even part
-            Vector256<float> tmp0 = block.V256_0;
-            Vector256<float> tmp1 = block.V256_2;
-            Vector256<float> tmp2 = block.V256_4;
-            Vector256<float> tmp3 = block.V256_6;
-
-            Vector256<float> z5 = tmp0;
-            Vector256<float> tmp10 = Avx.Add(z5, tmp2);
-            Vector256<float> tmp11 = Avx.Subtract(z5, tmp2);
-
-            var mm256_F_1_4142 = Vector256.Create(1.414213562f);
-            Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
-            Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
-
-            tmp0 = Avx.Add(tmp10, tmp13);
-            tmp3 = Avx.Subtract(tmp10, tmp13);
-            tmp1 = Avx.Add(tmp11, tmp12);
-            tmp2 = Avx.Subtract(tmp11, tmp12);
-
-            // Odd part
-            Vector256<float> tmp4 = block.V256_1;
-            Vector256<float> tmp5 = block.V256_3;
-            Vector256<float> tmp6 = block.V256_5;
-            Vector256<float> tmp7 = block.V256_7;
-
-            Vector256<float> z13 = Avx.Add(tmp6, tmp5);
-            Vector256<float> z10 = Avx.Subtract(tmp6, tmp5);
-            Vector256<float> z11 = Avx.Add(tmp4, tmp7);
-            Vector256<float> z12 = Avx.Subtract(tmp4, tmp7);
-
-            tmp7 = Avx.Add(z11, z13);
-            tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);
-
-            z5 = Avx.Multiply(Avx.Add(z10, z12), Vector256.Create(1.847759065f));                   // mm256_F_1_8477
-
-            tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f));   // mm256_F_n1_0823
-            tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f));   // mm256_F_n2_6131
-
-            tmp6 = Avx.Subtract(tmp12, tmp7);
-            tmp5 = Avx.Subtract(tmp11, tmp6);
-            tmp4 = Avx.Subtract(tmp10, tmp5);
-
-            block.V256_0 = Avx.Add(tmp0, tmp7);
-            block.V256_7 = Avx.Subtract(tmp0, tmp7);
-            block.V256_1 = Avx.Add(tmp1, tmp6);
-            block.V256_6 = Avx.Subtract(tmp1, tmp6);
-            block.V256_2 = Avx.Add(tmp2, tmp5);
-            block.V256_5 = Avx.Subtract(tmp2, tmp5);
-            block.V256_3 = Avx.Add(tmp3, tmp4);
-            block.V256_4 = Avx.Subtract(tmp3, tmp4);
-        }
-    }
-}
--- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs
@ -0,0 +1,142 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Runtime.Intrinsics;
+using SixLabors.ImageSharp.Common.Helpers;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal static partial class FloatingPointDCT
+{
+    /// <summary>
+    /// Applies the floating point FDCT to the block in place using <see cref="Vector256{T}"/> operations: a 1D pass, a transpose, then the same 1D pass again.
+    /// </summary>
+    /// <param name="block">Input block; it is overwritten with the transformed values.</param>
+    private static void FDCT8x8_Vector256(ref Block8x8F block)
+    {
+        DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation."); // Callers are expected to have checked this already.
+
+        // First pass - process columns
+        FDCT8x8_1D_Vector256(ref block);
+
+        // Second pass - transpose so the same 1D routine processes rows
+        block.TransposeInPlace();
+        FDCT8x8_1D_Vector256(ref block);
+
+        // Applies the 1D floating point FDCT in place across the eight V256_0..V256_7 vectors of the block
+        static void FDCT8x8_1D_Vector256(ref Block8x8F block)
+        {
+            Vector256<float> tmp0 = block.V256_0 + block.V256_7; // Sums and differences of the mirrored pairs 0/7, 1/6, 2/5 and 3/4.
+            Vector256<float> tmp7 = block.V256_0 - block.V256_7;
+            Vector256<float> tmp1 = block.V256_1 + block.V256_6;
+            Vector256<float> tmp6 = block.V256_1 - block.V256_6;
+            Vector256<float> tmp2 = block.V256_2 + block.V256_5;
+            Vector256<float> tmp5 = block.V256_2 - block.V256_5;
+            Vector256<float> tmp3 = block.V256_3 + block.V256_4;
+            Vector256<float> tmp4 = block.V256_3 - block.V256_4;
+
+            // Even part - built from the sums; writes V256_0, V256_4, V256_2 and V256_6
+            Vector256<float> tmp10 = tmp0 + tmp3;
+            Vector256<float> tmp13 = tmp0 - tmp3;
+            Vector256<float> tmp11 = tmp1 + tmp2;
+            Vector256<float> tmp12 = tmp1 - tmp2;
+
+            block.V256_0 = tmp10 + tmp11;
+            block.V256_4 = tmp10 - tmp11;
+
+            Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f); // ~1/sqrt(2); reused below for z3.
+            Vector256<float> z1 = (tmp12 + tmp13) * mm256_F_0_7071;
+            block.V256_2 = tmp13 + z1;
+            block.V256_6 = tmp13 - z1;
+
+            // Odd part - built from the differences; tmp10..tmp12 are reused as scratch; writes V256_5, V256_3, V256_1 and V256_7
+            tmp10 = tmp4 + tmp5;
+            tmp11 = tmp5 + tmp6;
+            tmp12 = tmp6 + tmp7;
+
+            Vector256<float> z5 = (tmp10 - tmp12) * Vector256.Create(0.382683433f);    // mm256_F_0_3826
+            Vector256<float> z2 = Vector256_.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10);    // mm256_F_0_5411; presumably z5 + (c * tmp10) - confirm against Vector256_.MultiplyAdd
+            Vector256<float> z4 = Vector256_.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12);    // mm256_F_1_3065; presumably z5 + (c * tmp12) - confirm against Vector256_.MultiplyAdd
+            Vector256<float> z3 = tmp11 * mm256_F_0_7071;
+
+            Vector256<float> z11 = tmp7 + z3;
+            Vector256<float> z13 = tmp7 - z3;
+
+            block.V256_5 = z13 + z2;
+            block.V256_3 = z13 - z2;
+            block.V256_1 = z11 + z4;
+            block.V256_7 = z11 - z4;
+        }
+    }
+
+    /// <summary>
+    /// Applies the floating point IDCT to the block in place using <see cref="Vector256{T}"/> operations: a 1D pass, a transpose, then the same 1D pass again.
+    /// </summary>
+    /// <param name="transposedBlock">Transposed input block; it is overwritten with the transformed values.</param>
+    private static void IDCT8x8_Vector256(ref Block8x8F transposedBlock)
+    {
+        DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation."); // Callers are expected to have checked this already.
+
+        // First pass - process columns
+        IDCT8x8_1D_Vector256(ref transposedBlock);
+
+        // Second pass - transpose so the same 1D routine processes rows
+        transposedBlock.TransposeInPlace();
+        IDCT8x8_1D_Vector256(ref transposedBlock);
+
+        // Applies the 1D floating point IDCT in place across the eight V256_0..V256_7 vectors of the block
+        static void IDCT8x8_1D_Vector256(ref Block8x8F block)
+        {
+            // Even part - reads V256_0, V256_2, V256_4 and V256_6
+            Vector256<float> tmp0 = block.V256_0;
+            Vector256<float> tmp1 = block.V256_2;
+            Vector256<float> tmp2 = block.V256_4;
+            Vector256<float> tmp3 = block.V256_6;
+
+            Vector256<float> z5 = tmp0; // Plain copy of tmp0 here; z5 is reassigned in the odd part.
+            Vector256<float> tmp10 = z5 + tmp2;
+            Vector256<float> tmp11 = z5 - tmp2;
+
+            Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f); // ~sqrt(2); reused below for tmp11.
+            Vector256<float> tmp13 = tmp1 + tmp3;
+            Vector256<float> tmp12 = Vector256_.MultiplySubtract(tmp13, tmp1 - tmp3, mm256_F_1_4142); // ((tmp1 - tmp3) * 1.4142) - tmp13
+
+            tmp0 = tmp10 + tmp13; // tmp0..tmp3 now hold the even-part results consumed by the final stage.
+            tmp3 = tmp10 - tmp13;
+            tmp1 = tmp11 + tmp12;
+            tmp2 = tmp11 - tmp12;
+
+            // Odd part - reads V256_1, V256_3, V256_5 and V256_7
+            Vector256<float> tmp4 = block.V256_1;
+            Vector256<float> tmp5 = block.V256_3;
+            Vector256<float> tmp6 = block.V256_5;
+            Vector256<float> tmp7 = block.V256_7;
+
+            Vector256<float> z13 = tmp6 + tmp5;
+            Vector256<float> z10 = tmp6 - tmp5;
+            Vector256<float> z11 = tmp4 + tmp7;
+            Vector256<float> z12 = tmp4 - tmp7;
+
+            tmp7 = z11 + z13;
+            tmp11 = (z11 - z13) * mm256_F_1_4142;
+
+            z5 = (z10 + z12) * Vector256.Create(1.847759065f);   // mm256_F_1_8477
+
+            tmp10 = Vector256_.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f));   // mm256_F_n1_0823; presumably z5 + (z12 * c) - confirm against Vector256_.MultiplyAdd
+            tmp12 = Vector256_.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f));   // mm256_F_n2_6131; presumably z5 + (z10 * c) - confirm against Vector256_.MultiplyAdd
+
+            tmp6 = tmp12 - tmp7; // Each of tmp6, tmp5, tmp4 depends on the one computed just before it.
+            tmp5 = tmp11 - tmp6;
+            tmp4 = tmp10 - tmp5;
+
+            block.V256_0 = tmp0 + tmp7; // Final stage: combine even (tmp0..tmp3) and odd (tmp4..tmp7) results into mirrored output pairs.
+            block.V256_7 = tmp0 - tmp7;
+            block.V256_1 = tmp1 + tmp6;
+            block.V256_6 = tmp1 - tmp6;
+            block.V256_2 = tmp2 + tmp5;
+            block.V256_5 = tmp2 - tmp5;
+            block.V256_3 = tmp3 + tmp4;
+            block.V256_4 = tmp3 - tmp4;
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs
@ -4,7 +4,7 @@
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;

 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
@ -101,7 +101,7 @@ internal static partial class FloatingPointDCT
    }

    /// <summary>
-    /// Apply 2D floating point IDCT inplace.
+    /// Apply 2D floating point IDCT in place.
    /// </summary>
    /// <remarks>
    /// Input block must be dequantized with quantization table
@ -110,9 +110,9 @@ internal static partial class FloatingPointDCT
    /// <param name="block">Input block.</param>
    public static void TransformIDCT(ref Block8x8F block)
    {
-        if (Avx.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
        {
-            IDCT8x8_Avx(ref block);
+            IDCT8x8_Vector256(ref block);
        }
        else
        {
@ -121,7 +121,7 @@ internal static partial class FloatingPointDCT
    }

    /// <summary>
-    /// Apply 2D floating point IDCT inplace.
+    /// Apply 2D floating point FDCT in place.
    /// </summary>
    /// <remarks>
    /// Input block must be quantized after this method with quantization
@ -130,9 +130,9 @@ internal static partial class FloatingPointDCT
    /// <param name="block">Input block.</param>
    public static void TransformFDCT(ref Block8x8F block)
    {
-        if (Avx.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
        {
-            FDCT8x8_Avx(ref block);
+            FDCT8x8_Vector256(ref block);
        }
        else
        {
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
@ -271,7 +271,7 @@ public class Block8x8Tests : JpegFixture

            Block8x8 block8x8 = Block8x8.Load(Create8x8ShortData());

-            block8x8.TransposeInplace();
+            block8x8.TransposeInPlace();

            short[] actual = new short[64];
            block8x8.CopyTo(actual);
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
@ -60,7 +60,7 @@ internal static partial class LibJpegTools

        internal void MakeBlock(Block8x8 block, int y, int x)
        {
-            block.TransposeInplace();
+            block.TransposeInPlace();
            this.MakeBlock(block.ToArray(), y, x);
        }