From e5188fe4f4b2060ed3329d696d4efb16bb7a51ca Mon Sep 17 00:00:00 2001
From: Dmitry Pentin
Date: Tue, 18 May 2021 12:56:53 +0300
Subject: [PATCH] Implemented FDCT8x8 using avx instruction set, added backward
 compatibility for FDCT8x4 calls using FDCT8x8(ref Block8x8F, ref Block8x8F)
 method

---
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 116 +++++++++++++++++-
 1 file changed, 110 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index a6d0622dd8..ad47aa05fb 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -3,6 +3,10 @@
 
 using System.Numerics;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
@@ -38,6 +42,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         private const float C_0_765367 = 0.765366865f;
 
         private const float C_0_125 = 0.1250f;
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector256<float> C_V_0_5411 = Vector256.Create(0.541196f);
+        private static readonly Vector256<float> C_V_1_3065 = Vector256.Create(1.306563f);
+        private static readonly Vector256<float> C_V_1_1758 = Vector256.Create(1.175876f);
+        private static readonly Vector256<float> C_V_0_7856 = Vector256.Create(0.785695f);
+        private static readonly Vector256<float> C_V_1_3870 = Vector256.Create(1.387040f);
+        private static readonly Vector256<float> C_V_0_2758 = Vector256.Create(0.275899f);
+
+        private static readonly Vector256<float> C_V_InvSqrt2 = Vector256.Create(0.707107f);
+#endif
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
         private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f);
 
@@ -308,12 +323,103 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             d.V7R = c0 - c3;
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
+        /// AVX implementation of the forward DCT pass: combines the V0..V7 vectors of the source
+        /// lane by lane and stores the result in V0..V7 of the destination, using FMA when available.
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        private static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+        {
+            // Sums and differences of the mirrored inputs (0/7, 1/6, 2/5, 3/4).
+            Vector256<float> t0 = Avx.Add(s.V0, s.V7);
+            Vector256<float> t7 = Avx.Subtract(s.V0, s.V7);
+            Vector256<float> t1 = Avx.Add(s.V1, s.V6);
+            Vector256<float> t6 = Avx.Subtract(s.V1, s.V6);
+            Vector256<float> t2 = Avx.Add(s.V2, s.V5);
+            Vector256<float> t5 = Avx.Subtract(s.V2, s.V5);
+            Vector256<float> t3 = Avx.Add(s.V3, s.V4);
+            Vector256<float> t4 = Avx.Subtract(s.V3, s.V4);
+
+            Vector256<float> c0 = Avx.Add(t0, t3);
+            Vector256<float> c1 = Avx.Add(t1, t2);
+
+            // 0 4
+            d.V0 = Avx.Add(c0, c1);
+            d.V4 = Avx.Subtract(c0, c1);
+
+            Vector256<float> c3 = Avx.Subtract(t0, t3);
+            Vector256<float> c2 = Avx.Subtract(t1, t2);
+
+            // Fma.IsSupported is tested once for all three rotations. Both branches evaluate the same
+            // products in the same order; the FMA branch only fuses the last multiply with the add/subtract.
+            if (Fma.IsSupported)
+            {
+                // 2 6
+                d.V2 = Fma.MultiplyAdd(c2, C_V_0_5411, Avx.Multiply(c3, C_V_1_3065));
+                d.V6 = Fma.MultiplySubtract(c3, C_V_0_5411, Avx.Multiply(c2, C_V_1_3065));
+
+                c3 = Fma.MultiplyAdd(t4, C_V_1_1758, Avx.Multiply(t7, C_V_0_7856));
+                c0 = Fma.MultiplySubtract(t7, C_V_1_1758, Avx.Multiply(t4, C_V_0_7856));
+
+                c2 = Fma.MultiplyAdd(t5, C_V_1_3870, Avx.Multiply(C_V_0_2758, t6));
+                c1 = Fma.MultiplySubtract(t6, C_V_1_3870, Avx.Multiply(C_V_0_2758, t5));
+            }
+            else
+            {
+                // 2 6
+                d.V2 = Avx.Add(Avx.Multiply(c2, C_V_0_5411), Avx.Multiply(c3, C_V_1_3065));
+                d.V6 = Avx.Subtract(Avx.Multiply(c3, C_V_0_5411), Avx.Multiply(c2, C_V_1_3065));
+
+                c3 = Avx.Add(Avx.Multiply(t4, C_V_1_1758), Avx.Multiply(t7, C_V_0_7856));
+                c0 = Avx.Subtract(Avx.Multiply(t7, C_V_1_1758), Avx.Multiply(t4, C_V_0_7856));
+
+                c2 = Avx.Add(Avx.Multiply(t5, C_V_1_3870), Avx.Multiply(C_V_0_2758, t6));
+                c1 = Avx.Subtract(Avx.Multiply(t6, C_V_1_3870), Avx.Multiply(C_V_0_2758, t5));
+            }
+
+            // 3 5
+            d.V3 = Avx.Subtract(c0, c2);
+            d.V5 = Avx.Subtract(c3, c1);
+
+            c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2);
+            c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2);
+
+            // 1 7
+            d.V1 = Avx.Add(c0, c3);
+            d.V7 = Avx.Subtract(c0, c3);
+        }
+#endif
+
         /// <summary>
-        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization)
+        /// Performs 8x8 matrix Forward Discrete Cosine Transform,
+        /// using the AVX implementation when it is supported and the two 8x4 half transforms otherwise.
         /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                FDCT8x8_Avx(ref s, ref d);
+            }
+            else
+#endif
+            {
+                FDCT8x4_LeftPart(ref s, ref d);
+                FDCT8x4_RightPart(ref s, ref d);
+            }
+        }
+
+        /// <summary>
+        /// Apply floating point FDCT from src into dest
+        /// </summary>
+        ///
         /// <param name="src">Source</param>
         /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller</param>
+        /// <param name="temp">Temporary block provided by the caller for optimization</param>
         /// <param name="offsetSourceByNeg128">If true, a constant -128.0 offset is applied for all values before FDCT</param>
         public static void TransformFDCT(
             ref Block8x8F src,
@@ -327,13 +433,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                 temp.AddInPlace(-128F);
             }
 
-            FDCT8x4_LeftPart(ref temp, ref dest);
-            FDCT8x4_RightPart(ref temp, ref dest);
+            FDCT8x8(ref temp, ref dest);
 
             dest.TransposeInto(ref temp);
 
-            FDCT8x4_LeftPart(ref temp, ref dest);
-            FDCT8x4_RightPart(ref temp, ref dest);
+            FDCT8x8(ref temp, ref dest);
 
             dest.MultiplyInPlace(C_0_125);
         }