From bd9f06f42be1d11df0b5080b04e52e577935aa26 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 28 Sep 2021 23:20:03 +0300
Subject: [PATCH] FDCT sse path via Vector4

---
 .../FastFloatingPointDCT.Intrinsic.cs         |  88 +----------
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 142 ++++++++++++++----
 2 files changed, 114 insertions(+), 116 deletions(-)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index 7d92c34682..f40ae6e874 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -18,11 +18,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
         private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
 
-        private static readonly Vector128<float> mm128_F_0_7071 = Vector128.Create(0.707106781f);
-        private static readonly Vector128<float> mm128_F_0_3826 = Vector128.Create(0.382683433f);
-        private static readonly Vector128<float> mm128_F_0_5411 = Vector128.Create(0.541196100f);
-        private static readonly Vector128<float> mm128_F_1_3065 = Vector128.Create(1.306562965f);
-
         private static readonly Vector256<float> mm256_F_1_1758 = Vector256.Create(1.175876f);
         private static readonly Vector256<float> mm256_F_n1_9615 = Vector256.Create(-1.961570560f);
         private static readonly Vector256<float> mm256_F_n0_3901 = Vector256.Create(-0.390180644f);
@@ -40,92 +35,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Apply floating point FDCT inplace using simd operations.
         /// </summary>
         /// <param name="block">Input matrix.</param>
-        private static void ForwardTransformSimd(ref Block8x8F block)
+        private static void ForwardTransform_Avx(ref Block8x8F block)
         {
-            DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation.");
+            DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
 
             // First pass - process rows
             block.TransposeInplace();
-            if (Avx.IsSupported)
-            {
-                FDCT8x8_Avx(ref block);
-            }
-            else
-            {
-                // Left part
-                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
-
-                // Right part
-                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
-            }
+            FDCT8x8_Avx(ref block);
 
             // Second pass - process columns
             block.TransposeInplace();
-            if (Avx.IsSupported)
-            {
-                FDCT8x8_Avx(ref block);
-            }
-            else
-            {
-                // Left part
-                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L));
-
-                // Right part
-                FDCT8x4_Sse(ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R));
-            }
-        }
-
-        /// <summary>
-        /// Apply 1D floating point FDCT inplace using SSE operations on 8x4 part of 8x8 matrix.
-        /// </summary>
-        /// <remarks>
-        /// Requires Sse support.
-        /// Must be called on both 8x4 matrix parts for the full FDCT transform.
-        /// </remarks>
-        /// <param name="blockRef">Input reference to the first </param>
-        public static void FDCT8x4_Sse(ref Vector128<float> blockRef)
-        {
-            DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation.");
-
-            Vector128<float> tmp0 = Sse.Add(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14));
-            Vector128<float> tmp7 = Sse.Subtract(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14));
-            Vector128<float> tmp1 = Sse.Add(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12));
-            Vector128<float> tmp6 = Sse.Subtract(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12));
-            Vector128<float> tmp2 = Sse.Add(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10));
-            Vector128<float> tmp5 = Sse.Subtract(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10));
-            Vector128<float> tmp3 = Sse.Add(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8));
-            Vector128<float> tmp4 = Sse.Subtract(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8));
-
-            // Even part
-            Vector128<float> tmp10 = Sse.Add(tmp0, tmp3);
-            Vector128<float> tmp13 = Sse.Subtract(tmp0, tmp3);
-            Vector128<float> tmp11 = Sse.Add(tmp1, tmp2);
-            Vector128<float> tmp12 = Sse.Subtract(tmp1, tmp2);
-
-            Unsafe.Add(ref blockRef, 0) = Sse.Add(tmp10, tmp11);
-            Unsafe.Add(ref blockRef, 8) = Sse.Subtract(tmp10, tmp11);
-
-            Vector128<float> z1 = Sse.Multiply(Sse.Add(tmp12, tmp13), mm128_F_0_7071);
-            Unsafe.Add(ref blockRef, 4) = Sse.Add(tmp13, z1);
-            Unsafe.Add(ref blockRef, 12) = Sse.Subtract(tmp13, z1);
-
-            // Odd part
-            tmp10 = Sse.Add(tmp4, tmp5);
-            tmp11 = Sse.Add(tmp5, tmp6);
-            tmp12 = Sse.Add(tmp6, tmp7);
-
-            Vector128<float> z5 = Sse.Multiply(Sse.Subtract(tmp10, tmp12), mm128_F_0_3826);
-            Vector128<float> z2 = Sse.Add(Sse.Multiply(mm128_F_0_5411, tmp10), z5);
-            Vector128<float> z4 = Sse.Add(Sse.Multiply(mm128_F_1_3065, tmp12), z5);
-            Vector128<float> z3 = Sse.Multiply(tmp11, mm128_F_0_7071);
-
-            Vector128<float> z11 = Sse.Add(tmp7, z3);
-            Vector128<float> z13 = Sse.Subtract(tmp7, z3);
-
-            Unsafe.Add(ref blockRef, 10) = Sse.Add(z13, z2);
-            Unsafe.Add(ref blockRef, 6) = Sse.Subtract(z13, z2);
-            Unsafe.Add(ref blockRef, 2) = Sse.Add(z11, z4);
-            Unsafe.Add(ref blockRef, 14) = Sse.Subtract(z11, z4);
+            FDCT8x8_Avx(ref block);
         }
 
         /// <summary>
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 985dac1bd8..43f6b7a1fc 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -18,30 +18,27 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
     {
 #pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
         private const float C_1_175876 = 1.175875602f;
-
         private const float C_1_961571 = -1.961570560f;
-
         private const float C_0_390181 = -0.390180644f;
-
         private const float C_0_899976 = -0.899976223f;
-
         private const float C_2_562915 = -2.562915447f;
-
         private const float C_0_298631 = 0.298631336f;
-
         private const float C_2_053120 = 2.053119869f;
-
         private const float C_3_072711 = 3.072711026f;
-
         private const float C_1_501321 = 1.501321110f;
-
         private const float C_0_541196 = 0.541196100f;
-
         private const float C_1_847759 = -1.847759065f;
-
         private const float C_0_765367 = 0.765366865f;
 
         private const float C_0_125 = 0.1250f;
+
+#pragma warning disable SA1311, IDE1006 // naming rules violation warnings
+        private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f);
+        private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f);
+        private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f);
+        private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f);
+#pragma warning restore SA1311, IDE1006
+
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
 
         /// <summary>
@@ -80,23 +77,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
         };
 
-        /// <summary>
-        /// Apply floating point IDCT inplace.
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
-        /// </summary>
-        /// <param name="block">Input matrix.</param>
-        /// <param name="temp">Matrix to store temporal results.</param>
-        public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
-        {
-            block.TransposeInplace();
-            IDCT8x8(ref block, ref temp);
-            temp.TransposeInplace();
-            IDCT8x8(ref temp, ref block);
-
-            // TODO: This can be fused into quantization table step
-            block.MultiplyInPlace(C_0_125);
-        }
-
         /// <summary>
         /// Apply 2D floating point FDCT inplace.
         /// </summary>
@@ -104,14 +84,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         public static void TransformFDCT(ref Block8x8F block)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
-            if (Sse.IsSupported)
+            if (Avx.IsSupported)
             {
-                ForwardTransformSimd(ref block);
+                ForwardTransform_Avx(ref block);
             }
             else
 #endif
+            if (Vector.IsHardwareAccelerated)
             {
-                ForwardTransformScalar(ref block);
+                ForwardTransform_Vector4(ref block);
+            }
+            else
+            {
+                ForwardTransform_Scalar(ref block);
             }
         }
 
@@ -122,7 +107,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
         /// </remarks>
         /// <param name="block">Input matrix.</param>
-        private static void ForwardTransformScalar(ref Block8x8F block)
+        private static void ForwardTransform_Scalar(ref Block8x8F block)
         {
             const int dctSize = 8;
 
@@ -225,6 +210,99 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             }
         }
 
+        /// <summary>
+        /// Apply floating point FDCT inplace using <see cref="Vector4"/> API.
+        /// </summary>
+        /// <remarks>
+        /// This implementation must be called only if the hardware supports
+        /// vectors of 4 floating point numbers. Otherwise the explicit scalar
+        /// implementation <see cref="ForwardTransform_Scalar"/> is faster
+        /// because it does not rely on matrix transposition.
+        /// </remarks>
+        /// <param name="block">Input matrix.</param>
+        private static void ForwardTransform_Vector4(ref Block8x8F block)
+        {
+            DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
+
+            // First pass - process rows
+            block.TransposeInplace();
+            FDCT8x4_Vector4(ref block.V0L);
+            FDCT8x4_Vector4(ref block.V0R);
+
+            // Second pass - process columns
+            block.TransposeInplace();
+            FDCT8x4_Vector4(ref block.V0L);
+            FDCT8x4_Vector4(ref block.V0R);
+        }
+
+        /// <summary>
+        /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix.
+        /// </summary>
+        /// <remarks>
+        /// Implemented using Vector4 API operations, for either a scalar or an SSE hardware implementation.
+        /// Must be called on both 8x4 matrix parts for the full FDCT transform.
+        /// </remarks>
+        /// <param name="blockRef">Input reference to the first <see cref="Vector4"/> of the 8x4 matrix part.</param>
+        private static void FDCT8x4_Vector4(ref Vector4 blockRef)
+        {
+            Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14);
+            Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14);
+            Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12);
+            Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12);
+            Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10);
+            Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10);
+            Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8);
+            Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8);
+
+            // Even part
+            Vector4 tmp10 = tmp0 + tmp3;
+            Vector4 tmp13 = tmp0 - tmp3;
+            Vector4 tmp11 = tmp1 + tmp2;
+            Vector4 tmp12 = tmp1 - tmp2;
+
+            Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
+            Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11;
+
+            Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
+            Unsafe.Add(ref blockRef, 4) = tmp13 + z1;
+            Unsafe.Add(ref blockRef, 12) = tmp13 - z1;
+
+            // Odd part
+            tmp10 = tmp4 + tmp5;
+            tmp11 = tmp5 + tmp6;
+            tmp12 = tmp6 + tmp7;
+
+            Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
+            Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
+            Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
+            Vector4 z3 = tmp11 * mm128_F_0_7071;
+
+            Vector4 z11 = tmp7 + z3;
+            Vector4 z13 = tmp7 - z3;
+
+            Unsafe.Add(ref blockRef, 10) = z13 + z2;
+            Unsafe.Add(ref blockRef, 6) = z13 - z2;
+            Unsafe.Add(ref blockRef, 2) = z11 + z4;
+            Unsafe.Add(ref blockRef, 14) = z11 - z4;
+        }
+
+        /// <summary>
+        /// Apply floating point IDCT inplace.
+        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
+        /// </summary>
+        /// <param name="block">Input matrix.</param>
+        /// <param name="temp">Matrix to store temporary results.</param>
+        public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
+        {
+            block.TransposeInplace();
+            IDCT8x8(ref block, ref temp);
+            temp.TransposeInplace();
+            IDCT8x8(ref temp, ref block);
+
+            // TODO: This can be fused into quantization table step
+            block.MultiplyInPlace(C_0_125);
+        }
+
         /// <summary>
         /// Performs 8x8 matrix Inverse Discrete Cosine Transform
         /// </summary>