From e5188fe4f4b2060ed3329d696d4efb16bb7a51ca Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Tue, 18 May 2021 12:56:53 +0300
Subject: [PATCH] Implemented FDCT8x8 using the AVX instruction set, with a
 fallback to the FDCT8x4 left/right routines inside the
 FDCT8x8(ref Block8x8F, ref Block8x8F) method when AVX is not supported

---
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 120 +++++++++++++++++-
 1 file changed, 114 insertions(+), 6 deletions(-)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index a6d0622dd8..ad47aa05fb 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -3,6 +3,10 @@
 
 using System.Numerics;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
@@ -38,6 +42,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private const float C_0_765367 = 0.765366865f;
 
         private const float C_0_125 = 0.1250f;
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector256<float> C_V_0_5411 = Vector256.Create(0.541196f);
+        private static readonly Vector256<float> C_V_1_3065 = Vector256.Create(1.306563f);
+        private static readonly Vector256<float> C_V_1_1758 = Vector256.Create(1.175876f);
+        private static readonly Vector256<float> C_V_0_7856 = Vector256.Create(0.785695f);
+        private static readonly Vector256<float> C_V_1_3870 = Vector256.Create(1.387040f);
+        private static readonly Vector256<float> C_V_0_2758 = Vector256.Create(0.275899f);
+
+        private static readonly Vector256<float> C_V_InvSqrt2 = Vector256.Create(0.707107f);
+#endif
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
         private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f);
 
@@ -308,12 +323,107 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             d.V7R = c0 - c3;
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
+        /// AVX implementation of one FDCT pass: combines the eight 256-bit vectors V0..V7 of the source element-wise (using FMA instructions when supported) and stores the result in V0..V7 of the destination.
+        /// </summary>
+        /// <param name="s">Source block; all eight of its vectors are read before anything is written to <paramref name="d"/></param>
+        /// <param name="d">Destination block; each of its vectors V0..V7 is overwritten</param>
+        private static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector256<float> t0 = Avx.Add(s.V0, s.V7);
+            Vector256<float> t7 = Avx.Subtract(s.V0, s.V7);
+            Vector256<float> t1 = Avx.Add(s.V1, s.V6);
+            Vector256<float> t6 = Avx.Subtract(s.V1, s.V6);
+            Vector256<float> t2 = Avx.Add(s.V2, s.V5);
+            Vector256<float> t5 = Avx.Subtract(s.V2, s.V5);
+            Vector256<float> t3 = Avx.Add(s.V3, s.V4);
+            Vector256<float> t4 = Avx.Subtract(s.V3, s.V4);
+
+            Vector256<float> c0 = Avx.Add(t0, t3);
+            Vector256<float> c1 = Avx.Add(t1, t2);
+
+            // Outputs 0 and 4: sum and difference of the two partial sums
+            d.V0 = Avx.Add(c0, c1);
+            d.V4 = Avx.Subtract(c0, c1);
+
+            Vector256<float> c3 = Avx.Subtract(t0, t3);
+            Vector256<float> c2 = Avx.Subtract(t1, t2);
+
+            // Outputs 2 and 6: weighted mix of c2 and c3 (fused multiply-add when FMA is supported)
+            if (Fma.IsSupported)
+            {
+                d.V2 = Fma.MultiplyAdd(c2, C_V_0_5411, Avx.Multiply(c3, C_V_1_3065));
+                d.V6 = Fma.MultiplySubtract(c3, C_V_0_5411, Avx.Multiply(c2, C_V_1_3065));
+            }
+            else
+            {
+                d.V2 = Avx.Add(Avx.Multiply(c2, C_V_0_5411), Avx.Multiply(c3, C_V_1_3065));
+                d.V6 = Avx.Subtract(Avx.Multiply(c3, C_V_0_5411), Avx.Multiply(c2, C_V_1_3065));
+            }
+
+            if (Fma.IsSupported)
+            {
+                c3 = Fma.MultiplyAdd(t4, C_V_1_1758, Avx.Multiply(t7, C_V_0_7856));
+                c0 = Fma.MultiplySubtract(t7, C_V_1_1758, Avx.Multiply(t4, C_V_0_7856));
+            }
+            else
+            {
+                c3 = Avx.Add(Avx.Multiply(t4, C_V_1_1758), Avx.Multiply(t7, C_V_0_7856));
+                c0 = Avx.Subtract(Avx.Multiply(t7, C_V_1_1758), Avx.Multiply(t4, C_V_0_7856));
+            }
+
+            if (Fma.IsSupported)
+            {
+                c2 = Fma.MultiplyAdd(t5, C_V_1_3870, Avx.Multiply(C_V_0_2758, t6));
+                c1 = Fma.MultiplySubtract(t6, C_V_1_3870, Avx.Multiply(C_V_0_2758, t5));
+            }
+            else
+            {
+                c2 = Avx.Add(Avx.Multiply(t5, C_V_1_3870), Avx.Multiply(C_V_0_2758, t6));
+                c1 = Avx.Subtract(Avx.Multiply(t6, C_V_1_3870), Avx.Multiply(C_V_0_2758, t5));
+            }
+
+            // Outputs 3 and 5: differences of the values computed from t4..t7 above
+            d.V3 = Avx.Subtract(c0, c2);
+            d.V5 = Avx.Subtract(c3, c1);
+
+            c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2);
+            c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2);
+
+            // Outputs 1 and 7: sum and difference of the two values scaled by C_V_InvSqrt2
+            d.V1 = Avx.Add(c0, c3);
+            d.V7 = Avx.Subtract(c0, c3);
+        }
+#endif
+
         /// <summary>
-        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization)
+        /// Runs one FDCT pass over an 8x8 block, using the AVX routine when supported and the two 8x4 half-block routines otherwise
         /// </summary>
+        /// <param name="s">Source block</param>
+        /// <param name="d">Destination block</param>
+        public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                FDCT8x8_Avx(ref s, ref d);
+                return;
+            }
+#endif
+
+            // AVX path not taken: process the block as a left and a right 8x4 half.
+            FDCT8x4_LeftPart(ref s, ref d);
+            FDCT8x4_RightPart(ref s, ref d);
+        }
+
+        /// <summary>
+        /// Apply floating point FDCT from src into dest
+        /// </summary>
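+        /// <remarks>Calls <see cref="FDCT8x8(ref Block8x8F, ref Block8x8F)"/> twice with a transpose in between, then scales the result by 0.125.</remarks>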
         /// <param name="src">Source</param>
         /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller</param>
+        /// <param name="temp">Temporary block provided by the caller for optimization</param>
         /// <param name="offsetSourceByNeg128">If true, a constant -128.0 offset is applied for all values before FDCT </param>
         public static void TransformFDCT(
             ref Block8x8F src,
@@ -327,13 +437,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 temp.AddInPlace(-128F);
             }
 
-            FDCT8x4_LeftPart(ref temp, ref dest);
-            FDCT8x4_RightPart(ref temp, ref dest);
+            FDCT8x8(ref temp, ref dest);
 
             dest.TransposeInto(ref temp);
 
-            FDCT8x4_LeftPart(ref temp, ref dest);
-            FDCT8x4_RightPart(ref temp, ref dest);
+            FDCT8x8(ref temp, ref dest);
 
             dest.MultiplyInPlace(C_0_125);
         }