diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs index fd3ad8d5ff..369172a2d8 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs @@ -2,6 +2,7 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; #if SUPPORTS_RUNTIME_INTRINSICS @@ -171,14 +172,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components d.V4R = my3 - mb3; } -#if SUPPORTS_RUNTIME_INTRINSICS /// - /// Do IDCT internal operations on the given block. + /// Combined operation of and + /// using AVX commands. /// /// Source /// Destination public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) { +#if SUPPORTS_RUNTIME_INTRINSICS + Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); + Vector256 my1 = s.V1; Vector256 my7 = s.V7; Vector256 mz0 = Avx.Add(my1, my7); @@ -191,40 +195,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758); - if (Fma.IsSupported) - { - mz2 = Fma.MultiplyAdd(mz2, C_V_n1_9615, mz4); - mz3 = Fma.MultiplyAdd(mz3, C_V_n0_3901, mz4); - } - else - { - mz2 = Avx.Add(Avx.Multiply(mz2, C_V_n1_9615), mz4); - mz3 = Avx.Add(Avx.Multiply(mz3, C_V_n0_3901), mz4); - } - + mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615); + mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901); mz0 = Avx.Multiply(mz0, C_V_n0_8999); mz1 = Avx.Multiply(mz1, C_V_n2_5629); + Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2); + Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3); + Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2); + Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3); - Unsafe.SkipInit(out Vector256 mb3); - Unsafe.SkipInit(out Vector256 mb2); - Unsafe.SkipInit(out Vector256 mb1); - Unsafe.SkipInit(out Vector256 mb0); - - if (Fma.IsSupported) - { - mb3 = Avx.Add(Fma.MultiplyAdd(my7, C_V_0_2986, mz0), mz2); - mb2 = Avx.Add(Fma.MultiplyAdd(my5, C_V_2_0531, mz1), mz3); - mb1 = Avx.Add(Fma.MultiplyAdd(my3, C_V_3_0727, mz1), mz2); - mb0 = Avx.Add(Fma.MultiplyAdd(my1, C_V_1_5013, mz0), mz3); - } - else - { - mb3 = Avx.Add(Avx.Add(Avx.Multiply(my7, C_V_0_2986), mz0), mz2); - mb2 = Avx.Add(Avx.Add(Avx.Multiply(my5, C_V_2_0531), mz1), mz3); - mb1 = Avx.Add(Avx.Add(Avx.Multiply(my3, C_V_3_0727), mz1), mz2); - mb0 = Avx.Add(Avx.Add(Avx.Multiply(my1, C_V_1_5013), mz0), mz3); - } Vector256 my2 = s.V2; Vector256 my6 = s.V6; @@ -233,17 +213,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components Vector256 my4 = s.V4; mz0 = Avx.Add(my0, my4); mz1 = Avx.Subtract(my0, my4); - - if (Fma.IsSupported) - { - mz2 = Fma.MultiplyAdd(my6, C_V_n1_8477, mz4); - mz3 = Fma.MultiplyAdd(my2, C_V_0_7653, mz4); - } - else - { - mz2 = Avx.Add(Avx.Multiply(my6, C_V_n1_8477), mz4); - mz3 = Avx.Add(Avx.Multiply(my2, C_V_0_7653), mz4); - } + mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477); + mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653); my0 = Avx.Add(mz0, mz3); my3 = Avx.Subtract(mz0, mz3); @@ -258,7 +229,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components d.V5 = Avx.Subtract(my2, mb2); d.V3 = Avx.Add(my3, mb3); d.V4 = Avx.Subtract(my3, mb3); - } #endif + } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 4ef4ab7b0b..493c0a6880 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; #if SUPPORTS_RUNTIME_INTRINSICS @@ -196,14 +197,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components d.V7R = c0 - c3; } -#if SUPPORTS_RUNTIME_INTRINSICS /// - /// + /// Combined operation of and + /// using AVX commands. /// /// Source /// Destination private static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) { +#if SUPPORTS_RUNTIME_INTRINSICS + Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); + Vector256 t0 = Avx.Add(s.V0, s.V7); Vector256 t7 = Avx.Subtract(s.V0, s.V7); Vector256 t1 = Avx.Add(s.V1, s.V6); @@ -224,36 +228,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components Vector256 c2 = Avx.Subtract(t1, t2); // 2 6 + d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065); if (Fma.IsSupported) { - d.V2 = Fma.MultiplyAdd(c2, C_V_0_5411, Avx.Multiply(c3, C_V_1_3065)); d.V6 = Fma.MultiplySubtract(c3, C_V_0_5411, Avx.Multiply(c2, C_V_1_3065)); } else { - d.V2 = Avx.Add(Avx.Multiply(c2, C_V_0_5411), Avx.Multiply(c3, C_V_1_3065)); d.V6 = Avx.Subtract(Avx.Multiply(c3, C_V_0_5411), Avx.Multiply(c2, C_V_1_3065)); } + c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856); if (Fma.IsSupported) { - c3 = Fma.MultiplyAdd(t4, C_V_1_1758, Avx.Multiply(t7, C_V_0_7856)); c0 = Fma.MultiplySubtract(t7, C_V_1_1758, Avx.Multiply(t4, C_V_0_7856)); } else { - c3 = Avx.Add(Avx.Multiply(t4, C_V_1_1758), Avx.Multiply(t7, C_V_0_7856)); c0 = Avx.Subtract(Avx.Multiply(t7, C_V_1_1758), Avx.Multiply(t4, C_V_0_7856)); } + c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6); if (Fma.IsSupported) { - c2 = Fma.MultiplyAdd(t5, C_V_1_3870, Avx.Multiply(C_V_0_2758, t6)); c1 = Fma.MultiplySubtract(t6, C_V_1_3870, Avx.Multiply(C_V_0_2758, t5)); } else { - c2 = Avx.Add(Avx.Multiply(t5, C_V_1_3870), Avx.Multiply(C_V_0_2758, t6)); c1 = Avx.Subtract(Avx.Multiply(t6, C_V_1_3870), Avx.Multiply(C_V_0_2758, t5)); } @@ -267,8 +268,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components // 1 7 d.V1 = Avx.Add(c0, c3); d.V7 = Avx.Subtract(c0, c3); - } #endif + } /// /// Performs 8x8 matrix Forward Discrete Cosine Transform