diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
index fd3ad8d5ff..369172a2d8 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
@@ -2,6 +2,7 @@
// Licensed under the Apache License, Version 2.0.
using System;
+using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
#if SUPPORTS_RUNTIME_INTRINSICS
@@ -171,14 +172,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
d.V4R = my3 - mb3;
}
-#if SUPPORTS_RUNTIME_INTRINSICS
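/// <summary>
- /// Do IDCT internal operations on the given block.
+ /// Combined operation of the left-part and right-part 8x4 IDCT passes,
+ /// performed on the whole 8x8 block using AVX instructions.
/// </summary>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>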
public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
{
+#if SUPPORTS_RUNTIME_INTRINSICS
+ Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
+
Vector256 my1 = s.V1;
Vector256 my7 = s.V7;
Vector256 mz0 = Avx.Add(my1, my7);
@@ -191,40 +195,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758);
- if (Fma.IsSupported)
- {
- mz2 = Fma.MultiplyAdd(mz2, C_V_n1_9615, mz4);
- mz3 = Fma.MultiplyAdd(mz3, C_V_n0_3901, mz4);
- }
- else
- {
- mz2 = Avx.Add(Avx.Multiply(mz2, C_V_n1_9615), mz4);
- mz3 = Avx.Add(Avx.Multiply(mz3, C_V_n0_3901), mz4);
- }
-
+ mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615);
+ mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901);
mz0 = Avx.Multiply(mz0, C_V_n0_8999);
mz1 = Avx.Multiply(mz1, C_V_n2_5629);
+ Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2);
+ Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3);
+ Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2);
+ Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3);
- Unsafe.SkipInit(out Vector256 mb3);
- Unsafe.SkipInit(out Vector256 mb2);
- Unsafe.SkipInit(out Vector256 mb1);
- Unsafe.SkipInit(out Vector256 mb0);
-
- if (Fma.IsSupported)
- {
- mb3 = Avx.Add(Fma.MultiplyAdd(my7, C_V_0_2986, mz0), mz2);
- mb2 = Avx.Add(Fma.MultiplyAdd(my5, C_V_2_0531, mz1), mz3);
- mb1 = Avx.Add(Fma.MultiplyAdd(my3, C_V_3_0727, mz1), mz2);
- mb0 = Avx.Add(Fma.MultiplyAdd(my1, C_V_1_5013, mz0), mz3);
- }
- else
- {
- mb3 = Avx.Add(Avx.Add(Avx.Multiply(my7, C_V_0_2986), mz0), mz2);
- mb2 = Avx.Add(Avx.Add(Avx.Multiply(my5, C_V_2_0531), mz1), mz3);
- mb1 = Avx.Add(Avx.Add(Avx.Multiply(my3, C_V_3_0727), mz1), mz2);
- mb0 = Avx.Add(Avx.Add(Avx.Multiply(my1, C_V_1_5013), mz0), mz3);
- }
Vector256 my2 = s.V2;
Vector256 my6 = s.V6;
@@ -233,17 +213,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
Vector256 my4 = s.V4;
mz0 = Avx.Add(my0, my4);
mz1 = Avx.Subtract(my0, my4);
-
- if (Fma.IsSupported)
- {
- mz2 = Fma.MultiplyAdd(my6, C_V_n1_8477, mz4);
- mz3 = Fma.MultiplyAdd(my2, C_V_0_7653, mz4);
- }
- else
- {
- mz2 = Avx.Add(Avx.Multiply(my6, C_V_n1_8477), mz4);
- mz3 = Avx.Add(Avx.Multiply(my2, C_V_0_7653), mz4);
- }
+ mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477);
+ mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653);
my0 = Avx.Add(mz0, mz3);
my3 = Avx.Subtract(mz0, mz3);
@@ -258,7 +229,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
d.V5 = Avx.Subtract(my2, mb2);
d.V3 = Avx.Add(my3, mb3);
d.V4 = Avx.Subtract(my3, mb3);
- }
#endif
+ }
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 4ef4ab7b0b..493c0a6880 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -1,6 +1,7 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
+using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
#if SUPPORTS_RUNTIME_INTRINSICS
@@ -196,14 +197,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
d.V7R = c0 - c3;
}
-#if SUPPORTS_RUNTIME_INTRINSICS
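/// <summary>
- ///
+ /// Combined operation of the left-part and right-part 8x4 FDCT passes,
+ /// performed on the whole 8x8 block using AVX instructions.
/// </summary>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>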
private static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
{
+#if SUPPORTS_RUNTIME_INTRINSICS
+ Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
+
Vector256 t0 = Avx.Add(s.V0, s.V7);
Vector256 t7 = Avx.Subtract(s.V0, s.V7);
Vector256 t1 = Avx.Add(s.V1, s.V6);
@@ -224,36 +228,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
Vector256 c2 = Avx.Subtract(t1, t2);
// 2 6
+ d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065);
if (Fma.IsSupported)
{
- d.V2 = Fma.MultiplyAdd(c2, C_V_0_5411, Avx.Multiply(c3, C_V_1_3065));
d.V6 = Fma.MultiplySubtract(c3, C_V_0_5411, Avx.Multiply(c2, C_V_1_3065));
}
else
{
- d.V2 = Avx.Add(Avx.Multiply(c2, C_V_0_5411), Avx.Multiply(c3, C_V_1_3065));
d.V6 = Avx.Subtract(Avx.Multiply(c3, C_V_0_5411), Avx.Multiply(c2, C_V_1_3065));
}
+ c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856);
if (Fma.IsSupported)
{
- c3 = Fma.MultiplyAdd(t4, C_V_1_1758, Avx.Multiply(t7, C_V_0_7856));
c0 = Fma.MultiplySubtract(t7, C_V_1_1758, Avx.Multiply(t4, C_V_0_7856));
}
else
{
- c3 = Avx.Add(Avx.Multiply(t4, C_V_1_1758), Avx.Multiply(t7, C_V_0_7856));
c0 = Avx.Subtract(Avx.Multiply(t7, C_V_1_1758), Avx.Multiply(t4, C_V_0_7856));
}
+ c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6);
if (Fma.IsSupported)
{
- c2 = Fma.MultiplyAdd(t5, C_V_1_3870, Avx.Multiply(C_V_0_2758, t6));
c1 = Fma.MultiplySubtract(t6, C_V_1_3870, Avx.Multiply(C_V_0_2758, t5));
}
else
{
- c2 = Avx.Add(Avx.Multiply(t5, C_V_1_3870), Avx.Multiply(C_V_0_2758, t6));
c1 = Avx.Subtract(Avx.Multiply(t6, C_V_1_3870), Avx.Multiply(C_V_0_2758, t5));
}
@@ -267,8 +268,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
// 1 7
d.V1 = Avx.Add(c0, c3);
d.V7 = Avx.Subtract(c0, c3);
- }
#endif
+ }
///
/// Performs 8x8 matrix Forward Discrete Cosine Transform