diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 8533b21518..e155e45361 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -619,29 +619,6 @@ internal static partial class SimdUtils
return va + (vm0 * vm1);
}
- ///
- /// Performs a multiplication and a subtraction of the .
- /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
- ///
- /// ret = (vm0 * vm1) - vs
- /// The vector to subtract from the intermediate result.
- /// The first vector to multiply.
- /// The second vector to multiply.
- /// The .
- [MethodImpl(InliningOptions.ShortMethod)]
- public static Vector256 MultiplySubtract(
- Vector256 vs,
- Vector256 vm0,
- Vector256 vm1)
- {
- if (Fma.IsSupported)
- {
- return Fma.MultiplySubtract(vm1, vm0, vs);
- }
-
- return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
- }
-
///
/// Performs a multiplication and a negated addition of the .
///
diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
index 8b22a51379..c835d267d8 100644
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -140,6 +140,28 @@ internal static class Vector256_
return va + (vm0 * vm1);
}
+ /// <summary>
+ /// Performs a multiplication and a subtraction of the given vectors, using the FMA intrinsic when it is supported.
+ /// </summary>
+ /// <remarks>ret = (vm0 * vm1) - vs</remarks>
+ /// <param name="vs">The vector to subtract from the intermediate result.</param>
+ /// <param name="vm0">The first vector to multiply.</param>
+ /// <param name="vm1">The second vector to multiply.</param>
+ /// <returns>The result of <c>(vm0 * vm1) - vs</c>.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector256 MultiplySubtract(
+ Vector256 vs,
+ Vector256 vm0,
+ Vector256 vm1)
+ {
+ if (Fma.IsSupported)
+ {
+ return Fma.MultiplySubtract(vm1, vm0, vs); // Intrinsic order is (a, b, c) => (a * b) - c, whereas this helper takes the subtrahend first.
+ }
+
+ return (vm0 * vm1) - vs;
+ }
+
///
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
///
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index 01d112bd6f..731ad0f765 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -211,10 +211,10 @@ internal partial struct Block8x8
}
///
- /// Transpose the block inplace.
+ /// Transpose the block in place.
///
[MethodImpl(InliningOptions.ShortMethod)]
- public void TransposeInplace()
+ public void TransposeInPlace()
{
ref short elemRef = ref Unsafe.As(ref this);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
deleted file mode 100644
index 862c774699..0000000000
--- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Six Labors Split License.
-
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-
-namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
-
-internal static partial class FloatingPointDCT
-{
- ///
- /// Apply floating point FDCT inplace using simd operations.
- ///
- /// Input block.
- private static void FDCT8x8_Avx(ref Block8x8F block)
- {
- DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
-
- // First pass - process columns
- FDCT8x8_1D_Avx(ref block);
-
- // Second pass - process rows
- block.TransposeInPlace();
- FDCT8x8_1D_Avx(ref block);
-
- // Applies 1D floating point FDCT inplace
- static void FDCT8x8_1D_Avx(ref Block8x8F block)
- {
- Vector256 tmp0 = Avx.Add(block.V256_0, block.V256_7);
- Vector256 tmp7 = Avx.Subtract(block.V256_0, block.V256_7);
- Vector256 tmp1 = Avx.Add(block.V256_1, block.V256_6);
- Vector256 tmp6 = Avx.Subtract(block.V256_1, block.V256_6);
- Vector256 tmp2 = Avx.Add(block.V256_2, block.V256_5);
- Vector256 tmp5 = Avx.Subtract(block.V256_2, block.V256_5);
- Vector256 tmp3 = Avx.Add(block.V256_3, block.V256_4);
- Vector256 tmp4 = Avx.Subtract(block.V256_3, block.V256_4);
-
- // Even part
- Vector256 tmp10 = Avx.Add(tmp0, tmp3);
- Vector256 tmp13 = Avx.Subtract(tmp0, tmp3);
- Vector256 tmp11 = Avx.Add(tmp1, tmp2);
- Vector256 tmp12 = Avx.Subtract(tmp1, tmp2);
-
- block.V256_0 = Avx.Add(tmp10, tmp11);
- block.V256_4 = Avx.Subtract(tmp10, tmp11);
-
- var mm256_F_0_7071 = Vector256.Create(0.707106781f);
- Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
- block.V256_2 = Avx.Add(tmp13, z1);
- block.V256_6 = Avx.Subtract(tmp13, z1);
-
- // Odd part
- tmp10 = Avx.Add(tmp4, tmp5);
- tmp11 = Avx.Add(tmp5, tmp6);
- tmp12 = Avx.Add(tmp6, tmp7);
-
- Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), Vector256.Create(0.382683433f)); // mm256_F_0_3826
- Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411
- Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065
- Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
-
- Vector256 z11 = Avx.Add(tmp7, z3);
- Vector256 z13 = Avx.Subtract(tmp7, z3);
-
- block.V256_5 = Avx.Add(z13, z2);
- block.V256_3 = Avx.Subtract(z13, z2);
- block.V256_1 = Avx.Add(z11, z4);
- block.V256_7 = Avx.Subtract(z11, z4);
- }
- }
-
- ///
- /// Apply floating point IDCT inplace using simd operations.
- ///
- /// Transposed input block.
- private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
- {
- DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
-
- // First pass - process columns
- IDCT8x8_1D_Avx(ref transposedBlock);
-
- // Second pass - process rows
- transposedBlock.TransposeInPlace();
- IDCT8x8_1D_Avx(ref transposedBlock);
-
- // Applies 1D floating point FDCT inplace
- static void IDCT8x8_1D_Avx(ref Block8x8F block)
- {
- // Even part
- Vector256 tmp0 = block.V256_0;
- Vector256 tmp1 = block.V256_2;
- Vector256 tmp2 = block.V256_4;
- Vector256 tmp3 = block.V256_6;
-
- Vector256 z5 = tmp0;
- Vector256 tmp10 = Avx.Add(z5, tmp2);
- Vector256 tmp11 = Avx.Subtract(z5, tmp2);
-
- var mm256_F_1_4142 = Vector256.Create(1.414213562f);
- Vector256 tmp13 = Avx.Add(tmp1, tmp3);
- Vector256 tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
-
- tmp0 = Avx.Add(tmp10, tmp13);
- tmp3 = Avx.Subtract(tmp10, tmp13);
- tmp1 = Avx.Add(tmp11, tmp12);
- tmp2 = Avx.Subtract(tmp11, tmp12);
-
- // Odd part
- Vector256 tmp4 = block.V256_1;
- Vector256 tmp5 = block.V256_3;
- Vector256 tmp6 = block.V256_5;
- Vector256 tmp7 = block.V256_7;
-
- Vector256 z13 = Avx.Add(tmp6, tmp5);
- Vector256 z10 = Avx.Subtract(tmp6, tmp5);
- Vector256 z11 = Avx.Add(tmp4, tmp7);
- Vector256 z12 = Avx.Subtract(tmp4, tmp7);
-
- tmp7 = Avx.Add(z11, z13);
- tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);
-
- z5 = Avx.Multiply(Avx.Add(z10, z12), Vector256.Create(1.847759065f)); // mm256_F_1_8477
-
- tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823
- tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131
-
- tmp6 = Avx.Subtract(tmp12, tmp7);
- tmp5 = Avx.Subtract(tmp11, tmp6);
- tmp4 = Avx.Subtract(tmp10, tmp5);
-
- block.V256_0 = Avx.Add(tmp0, tmp7);
- block.V256_7 = Avx.Subtract(tmp0, tmp7);
- block.V256_1 = Avx.Add(tmp1, tmp6);
- block.V256_6 = Avx.Subtract(tmp1, tmp6);
- block.V256_2 = Avx.Add(tmp2, tmp5);
- block.V256_5 = Avx.Subtract(tmp2, tmp5);
- block.V256_3 = Avx.Add(tmp3, tmp4);
- block.V256_4 = Avx.Subtract(tmp3, tmp4);
- }
- }
-}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs
new file mode 100644
index 0000000000..bcd8c70431
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs
@@ -0,0 +1,142 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Runtime.Intrinsics;
+using SixLabors.ImageSharp.Common.Helpers;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal static partial class FloatingPointDCT
+{
+ /// <summary>
+ /// Apply floating point FDCT in place using SIMD operations: a 1D pass, an in-place transpose, then a second 1D pass.
+ /// </summary>
+ /// <param name="block">Input block.</param>
+ private static void FDCT8x8_Vector256(ref Block8x8F block)
+ {
+ DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");
+
+ // First pass - process columns
+ FDCT8x8_1D_Vector256(ref block);
+
+ // Second pass - process rows
+ block.TransposeInPlace();
+ FDCT8x8_1D_Vector256(ref block);
+
+ // Applies 1D floating point FDCT in place
+ static void FDCT8x8_1D_Vector256(ref Block8x8F block)
+ {
+ Vector256 tmp0 = block.V256_0 + block.V256_7;
+ Vector256 tmp7 = block.V256_0 - block.V256_7;
+ Vector256 tmp1 = block.V256_1 + block.V256_6;
+ Vector256 tmp6 = block.V256_1 - block.V256_6;
+ Vector256 tmp2 = block.V256_2 + block.V256_5;
+ Vector256 tmp5 = block.V256_2 - block.V256_5;
+ Vector256 tmp3 = block.V256_3 + block.V256_4;
+ Vector256 tmp4 = block.V256_3 - block.V256_4;
+
+ // Even part - writes V256_0, V256_4, V256_2 and V256_6
+ Vector256 tmp10 = tmp0 + tmp3;
+ Vector256 tmp13 = tmp0 - tmp3;
+ Vector256 tmp11 = tmp1 + tmp2;
+ Vector256 tmp12 = tmp1 - tmp2;
+
+ block.V256_0 = tmp10 + tmp11;
+ block.V256_4 = tmp10 - tmp11;
+
+ Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f);
+ Vector256 z1 = (tmp12 + tmp13) * mm256_F_0_7071;
+ block.V256_2 = tmp13 + z1;
+ block.V256_6 = tmp13 - z1;
+
+ // Odd part - reuses tmp10..tmp12 and writes V256_5, V256_3, V256_1 and V256_7
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ Vector256 z5 = (tmp10 - tmp12) * Vector256.Create(0.382683433f); // mm256_F_0_3826
+ Vector256 z2 = Vector256_.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411
+ Vector256 z4 = Vector256_.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065
+ Vector256 z3 = tmp11 * mm256_F_0_7071;
+
+ Vector256 z11 = tmp7 + z3;
+ Vector256 z13 = tmp7 - z3;
+
+ block.V256_5 = z13 + z2;
+ block.V256_3 = z13 - z2;
+ block.V256_1 = z11 + z4;
+ block.V256_7 = z11 - z4;
+ }
+ }
+
+ /// <summary>
+ /// Apply floating point IDCT in place using SIMD operations: a 1D pass, an in-place transpose, then a second 1D pass.
+ /// </summary>
+ /// <param name="transposedBlock">Transposed input block.</param>
+ private static void IDCT8x8_Vector256(ref Block8x8F transposedBlock)
+ {
+ DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");
+
+ // First pass - process columns
+ IDCT8x8_1D_Vector256(ref transposedBlock);
+
+ // Second pass - process rows
+ transposedBlock.TransposeInPlace();
+ IDCT8x8_1D_Vector256(ref transposedBlock);
+
+ // Applies 1D floating point IDCT in place
+ static void IDCT8x8_1D_Vector256(ref Block8x8F block)
+ {
+ // Even part - reads V256_0, V256_2, V256_4 and V256_6
+ Vector256 tmp0 = block.V256_0;
+ Vector256 tmp1 = block.V256_2;
+ Vector256 tmp2 = block.V256_4;
+ Vector256 tmp3 = block.V256_6;
+
+ Vector256 z5 = tmp0;
+ Vector256 tmp10 = z5 + tmp2;
+ Vector256 tmp11 = z5 - tmp2;
+
+ Vector256 mm256_F_1_4142 = Vector256.Create(1.414213562f);
+ Vector256 tmp13 = tmp1 + tmp3;
+ Vector256 tmp12 = Vector256_.MultiplySubtract(tmp13, tmp1 - tmp3, mm256_F_1_4142);
+
+ tmp0 = tmp10 + tmp13;
+ tmp3 = tmp10 - tmp13;
+ tmp1 = tmp11 + tmp12;
+ tmp2 = tmp11 - tmp12;
+
+ // Odd part - reads V256_1, V256_3, V256_5 and V256_7
+ Vector256 tmp4 = block.V256_1;
+ Vector256 tmp5 = block.V256_3;
+ Vector256 tmp6 = block.V256_5;
+ Vector256 tmp7 = block.V256_7;
+
+ Vector256 z13 = tmp6 + tmp5;
+ Vector256 z10 = tmp6 - tmp5;
+ Vector256 z11 = tmp4 + tmp7;
+ Vector256 z12 = tmp4 - tmp7;
+
+ tmp7 = z11 + z13;
+ tmp11 = (z11 - z13) * mm256_F_1_4142;
+
+ z5 = (z10 + z12) * Vector256.Create(1.847759065f); // mm256_F_1_8477
+
+ tmp10 = Vector256_.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823
+ tmp12 = Vector256_.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131
+
+ tmp6 = tmp12 - tmp7;
+ tmp5 = tmp11 - tmp6;
+ tmp4 = tmp10 - tmp5;
+
+ block.V256_0 = tmp0 + tmp7;
+ block.V256_7 = tmp0 - tmp7;
+ block.V256_1 = tmp1 + tmp6;
+ block.V256_6 = tmp1 - tmp6;
+ block.V256_2 = tmp2 + tmp5;
+ block.V256_5 = tmp2 - tmp5;
+ block.V256_3 = tmp3 + tmp4;
+ block.V256_4 = tmp3 - tmp4;
+ }
+ }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs
index 4c22307cfe..8122d8daa6 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs
@@ -4,7 +4,7 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
@@ -101,7 +101,7 @@ internal static partial class FloatingPointDCT
}
///
- /// Apply 2D floating point IDCT inplace.
+ /// Apply 2D floating point IDCT in place.
///
///
/// Input block must be dequantized with quantization table
@@ -110,9 +110,9 @@ internal static partial class FloatingPointDCT
/// Input block.
public static void TransformIDCT(ref Block8x8F block)
{
- if (Avx.IsSupported)
+ if (Vector256.IsHardwareAccelerated)
{
- IDCT8x8_Avx(ref block);
+ IDCT8x8_Vector256(ref block);
}
else
{
@@ -121,7 +121,7 @@ internal static partial class FloatingPointDCT
}
///
- /// Apply 2D floating point IDCT inplace.
+ /// Apply 2D floating point FDCT in place.
///
///
/// Input block must be quantized after this method with quantization
@@ -130,9 +130,9 @@ internal static partial class FloatingPointDCT
/// Input block.
public static void TransformFDCT(ref Block8x8F block)
{
- if (Avx.IsSupported)
+ if (Vector256.IsHardwareAccelerated)
{
- FDCT8x8_Avx(ref block);
+ FDCT8x8_Vector256(ref block);
}
else
{
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
index b5d364dd38..cb8f52a96f 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
@@ -271,7 +271,7 @@ public class Block8x8Tests : JpegFixture
Block8x8 block8x8 = Block8x8.Load(Create8x8ShortData());
- block8x8.TransposeInplace();
+ block8x8.TransposeInPlace();
short[] actual = new short[64];
block8x8.CopyTo(actual);
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
index 65d0a01ffe..975378b5f8 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
@@ -60,7 +60,7 @@ internal static partial class LibJpegTools
internal void MakeBlock(Block8x8 block, int y, int x)
{
- block.TransposeInplace();
+ block.TransposeInPlace();
this.MakeBlock(block.ToArray(), y, x);
}