diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 8533b21518..e155e45361 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -619,29 +619,6 @@ internal static partial class SimdUtils return va + (vm0 * vm1); } - /// - /// Performs a multiplication and a subtraction of the . - /// TODO: Fix. The arguments are in a different order to the FMA intrinsic. - /// - /// ret = (vm0 * vm1) - vs - /// The vector to subtract from the intermediate result. - /// The first vector to multiply. - /// The second vector to multiply. - /// The . - [MethodImpl(InliningOptions.ShortMethod)] - public static Vector256 MultiplySubtract( - Vector256 vs, - Vector256 vm0, - Vector256 vm1) - { - if (Fma.IsSupported) - { - return Fma.MultiplySubtract(vm1, vm0, vs); - } - - return Avx.Subtract(Avx.Multiply(vm0, vm1), vs); - } - /// /// Performs a multiplication and a negated addition of the . /// diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 8b22a51379..c835d267d8 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -140,6 +140,28 @@ internal static class Vector256_ return va + (vm0 * vm1); } + /// + /// Performs a multiplication and a subtraction of the . + /// + /// ret = (vm0 * vm1) - vs + /// The vector to subtract from the intermediate result. + /// The first vector to multiply. + /// The second vector to multiply. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplySubtract( + Vector256 vs, + Vector256 vm0, + Vector256 vm1) + { + if (Fma.IsSupported) + { + return Fma.MultiplySubtract(vm1, vm0, vs); + } + + return (vm0 * vm1) - vs; + } + /// /// Packs signed 32-bit integers to signed 16-bit integers and saturates. 
/// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index 01d112bd6f..731ad0f765 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -211,10 +211,10 @@ internal partial struct Block8x8 } /// - /// Transpose the block inplace. + /// Transpose the block in place. /// [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInplace() + public void TransposeInPlace() { ref short elemRef = ref Unsafe.As(ref this); diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs deleted file mode 100644 index 862c774699..0000000000 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. - -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; - -namespace SixLabors.ImageSharp.Formats.Jpeg.Components; - -internal static partial class FloatingPointDCT -{ - /// - /// Apply floating point FDCT inplace using simd operations. - /// - /// Input block. 
- private static void FDCT8x8_Avx(ref Block8x8F block) - { - DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); - - // First pass - process columns - FDCT8x8_1D_Avx(ref block); - - // Second pass - process rows - block.TransposeInPlace(); - FDCT8x8_1D_Avx(ref block); - - // Applies 1D floating point FDCT inplace - static void FDCT8x8_1D_Avx(ref Block8x8F block) - { - Vector256 tmp0 = Avx.Add(block.V256_0, block.V256_7); - Vector256 tmp7 = Avx.Subtract(block.V256_0, block.V256_7); - Vector256 tmp1 = Avx.Add(block.V256_1, block.V256_6); - Vector256 tmp6 = Avx.Subtract(block.V256_1, block.V256_6); - Vector256 tmp2 = Avx.Add(block.V256_2, block.V256_5); - Vector256 tmp5 = Avx.Subtract(block.V256_2, block.V256_5); - Vector256 tmp3 = Avx.Add(block.V256_3, block.V256_4); - Vector256 tmp4 = Avx.Subtract(block.V256_3, block.V256_4); - - // Even part - Vector256 tmp10 = Avx.Add(tmp0, tmp3); - Vector256 tmp13 = Avx.Subtract(tmp0, tmp3); - Vector256 tmp11 = Avx.Add(tmp1, tmp2); - Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); - - block.V256_0 = Avx.Add(tmp10, tmp11); - block.V256_4 = Avx.Subtract(tmp10, tmp11); - - var mm256_F_0_7071 = Vector256.Create(0.707106781f); - Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); - block.V256_2 = Avx.Add(tmp13, z1); - block.V256_6 = Avx.Subtract(tmp13, z1); - - // Odd part - tmp10 = Avx.Add(tmp4, tmp5); - tmp11 = Avx.Add(tmp5, tmp6); - tmp12 = Avx.Add(tmp6, tmp7); - - Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), Vector256.Create(0.382683433f)); // mm256_F_0_3826 - Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411 - Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065 - Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); - - Vector256 z11 = Avx.Add(tmp7, z3); - Vector256 z13 = Avx.Subtract(tmp7, z3); - - block.V256_5 = Avx.Add(z13, z2); - block.V256_3 = 
Avx.Subtract(z13, z2); - block.V256_1 = Avx.Add(z11, z4); - block.V256_7 = Avx.Subtract(z11, z4); - } - } - - /// - /// Apply floating point IDCT inplace using simd operations. - /// - /// Transposed input block. - private static void IDCT8x8_Avx(ref Block8x8F transposedBlock) - { - DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); - - // First pass - process columns - IDCT8x8_1D_Avx(ref transposedBlock); - - // Second pass - process rows - transposedBlock.TransposeInPlace(); - IDCT8x8_1D_Avx(ref transposedBlock); - - // Applies 1D floating point FDCT inplace - static void IDCT8x8_1D_Avx(ref Block8x8F block) - { - // Even part - Vector256 tmp0 = block.V256_0; - Vector256 tmp1 = block.V256_2; - Vector256 tmp2 = block.V256_4; - Vector256 tmp3 = block.V256_6; - - Vector256 z5 = tmp0; - Vector256 tmp10 = Avx.Add(z5, tmp2); - Vector256 tmp11 = Avx.Subtract(z5, tmp2); - - var mm256_F_1_4142 = Vector256.Create(1.414213562f); - Vector256 tmp13 = Avx.Add(tmp1, tmp3); - Vector256 tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142); - - tmp0 = Avx.Add(tmp10, tmp13); - tmp3 = Avx.Subtract(tmp10, tmp13); - tmp1 = Avx.Add(tmp11, tmp12); - tmp2 = Avx.Subtract(tmp11, tmp12); - - // Odd part - Vector256 tmp4 = block.V256_1; - Vector256 tmp5 = block.V256_3; - Vector256 tmp6 = block.V256_5; - Vector256 tmp7 = block.V256_7; - - Vector256 z13 = Avx.Add(tmp6, tmp5); - Vector256 z10 = Avx.Subtract(tmp6, tmp5); - Vector256 z11 = Avx.Add(tmp4, tmp7); - Vector256 z12 = Avx.Subtract(tmp4, tmp7); - - tmp7 = Avx.Add(z11, z13); - tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142); - - z5 = Avx.Multiply(Avx.Add(z10, z12), Vector256.Create(1.847759065f)); // mm256_F_1_8477 - - tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823 - tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131 - - tmp6 = 
Avx.Subtract(tmp12, tmp7); - tmp5 = Avx.Subtract(tmp11, tmp6); - tmp4 = Avx.Subtract(tmp10, tmp5); - - block.V256_0 = Avx.Add(tmp0, tmp7); - block.V256_7 = Avx.Subtract(tmp0, tmp7); - block.V256_1 = Avx.Add(tmp1, tmp6); - block.V256_6 = Avx.Subtract(tmp1, tmp6); - block.V256_2 = Avx.Add(tmp2, tmp5); - block.V256_5 = Avx.Subtract(tmp2, tmp5); - block.V256_3 = Avx.Add(tmp3, tmp4); - block.V256_4 = Avx.Subtract(tmp3, tmp4); - } - } -} diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs new file mode 100644 index 0000000000..bcd8c70431 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs @@ -0,0 +1,142 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Runtime.Intrinsics; +using SixLabors.ImageSharp.Common.Helpers; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal static partial class FloatingPointDCT +{ + /// + /// Apply floating point FDCT in place using simd operations. + /// + /// Input block. 
+ private static void FDCT8x8_Vector256(ref Block8x8F block) + { + DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation."); + + // First pass - process columns + FDCT8x8_1D_Vector256(ref block); + + // Second pass - process rows + block.TransposeInPlace(); + FDCT8x8_1D_Vector256(ref block); + + // Applies 1D floating point FDCT in place + static void FDCT8x8_1D_Vector256(ref Block8x8F block) + { + Vector256 tmp0 = block.V256_0 + block.V256_7; + Vector256 tmp7 = block.V256_0 - block.V256_7; + Vector256 tmp1 = block.V256_1 + block.V256_6; + Vector256 tmp6 = block.V256_1 - block.V256_6; + Vector256 tmp2 = block.V256_2 + block.V256_5; + Vector256 tmp5 = block.V256_2 - block.V256_5; + Vector256 tmp3 = block.V256_3 + block.V256_4; + Vector256 tmp4 = block.V256_3 - block.V256_4; + + // Even part + Vector256 tmp10 = tmp0 + tmp3; + Vector256 tmp13 = tmp0 - tmp3; + Vector256 tmp11 = tmp1 + tmp2; + Vector256 tmp12 = tmp1 - tmp2; + + block.V256_0 = tmp10 + tmp11; + block.V256_4 = tmp10 - tmp11; + + Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); + Vector256 z1 = (tmp12 + tmp13) * mm256_F_0_7071; + block.V256_2 = tmp13 + z1; + block.V256_6 = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + Vector256 z5 = (tmp10 - tmp12) * Vector256.Create(0.382683433f); // mm256_F_0_3826 + Vector256 z2 = Vector256_.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411 + Vector256 z4 = Vector256_.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065 + Vector256 z3 = tmp11 * mm256_F_0_7071; + + Vector256 z11 = tmp7 + z3; + Vector256 z13 = tmp7 - z3; + + block.V256_5 = z13 + z2; + block.V256_3 = z13 - z2; + block.V256_1 = z11 + z4; + block.V256_7 = z11 - z4; + } + } + + /// + /// Apply floating point IDCT in place using simd operations. + /// + /// Transposed input block. 
+ private static void IDCT8x8_Vector256(ref Block8x8F transposedBlock) + { + DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation."); + + // First pass - process columns + IDCT8x8_1D_Vector256(ref transposedBlock); + + // Second pass - process rows + transposedBlock.TransposeInPlace(); + IDCT8x8_1D_Vector256(ref transposedBlock); + + // Applies 1D floating point IDCT in place + static void IDCT8x8_1D_Vector256(ref Block8x8F block) + { + // Even part + Vector256 tmp0 = block.V256_0; + Vector256 tmp1 = block.V256_2; + Vector256 tmp2 = block.V256_4; + Vector256 tmp3 = block.V256_6; + + Vector256 z5 = tmp0; + Vector256 tmp10 = z5 + tmp2; + Vector256 tmp11 = z5 - tmp2; + + Vector256 mm256_F_1_4142 = Vector256.Create(1.414213562f); + Vector256 tmp13 = tmp1 + tmp3; + Vector256 tmp12 = Vector256_.MultiplySubtract(tmp13, tmp1 - tmp3, mm256_F_1_4142); + + tmp0 = tmp10 + tmp13; + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + // Odd part + Vector256 tmp4 = block.V256_1; + Vector256 tmp5 = block.V256_3; + Vector256 tmp6 = block.V256_5; + Vector256 tmp7 = block.V256_7; + + Vector256 z13 = tmp6 + tmp5; + Vector256 z10 = tmp6 - tmp5; + Vector256 z11 = tmp4 + tmp7; + Vector256 z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; + tmp11 = (z11 - z13) * mm256_F_1_4142; + + z5 = (z10 + z12) * Vector256.Create(1.847759065f); // mm256_F_1_8477 + + tmp10 = Vector256_.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823 + tmp12 = Vector256_.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131 + + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 - tmp5; + + block.V256_0 = tmp0 + tmp7; + block.V256_7 = tmp0 - tmp7; + block.V256_1 = tmp1 + tmp6; + block.V256_6 = tmp1 - tmp6; + block.V256_2 = tmp2 + tmp5; + block.V256_5 = tmp2 - tmp5; + block.V256_3 = tmp3 + tmp4; + block.V256_4 = tmp3 - tmp4; + } + } +} diff --git
a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs index 4c22307cfe..8122d8daa6 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs @@ -4,7 +4,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Jpeg.Components; @@ -101,7 +101,7 @@ internal static partial class FloatingPointDCT } /// - /// Apply 2D floating point IDCT inplace. + /// Apply 2D floating point IDCT in place. /// /// /// Input block must be dequantized with quantization table @@ -110,9 +110,9 @@ internal static partial class FloatingPointDCT /// Input block. public static void TransformIDCT(ref Block8x8F block) { - if (Avx.IsSupported) + if (Vector256.IsHardwareAccelerated) { - IDCT8x8_Avx(ref block); + IDCT8x8_Vector256(ref block); } else { @@ -121,7 +121,7 @@ internal static partial class FloatingPointDCT } /// - /// Apply 2D floating point IDCT inplace. + /// Apply 2D floating point FDCT in place. /// /// /// Input block must be quantized after this method with quantization @@ -130,9 +130,9 @@ internal static partial class FloatingPointDCT /// Input block.
public static void TransformFDCT(ref Block8x8F block) { - if (Avx.IsSupported) + if (Vector256.IsHardwareAccelerated) { - FDCT8x8_Avx(ref block); + FDCT8x8_Vector256(ref block); } else { diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs index b5d364dd38..cb8f52a96f 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs @@ -271,7 +271,7 @@ public class Block8x8Tests : JpegFixture Block8x8 block8x8 = Block8x8.Load(Create8x8ShortData()); - block8x8.TransposeInplace(); + block8x8.TransposeInPlace(); short[] actual = new short[64]; block8x8.CopyTo(actual); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs index 65d0a01ffe..975378b5f8 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs @@ -60,7 +60,7 @@ internal static partial class LibJpegTools internal void MakeBlock(Block8x8 block, int y, int x) { - block.TransposeInplace(); + block.TransposeInPlace(); this.MakeBlock(block.ToArray(), y, x); }