diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
deleted file mode 100644
index 369172a2d8..0000000000
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.IDCT.cs
+++ /dev/null
@@ -1,235 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-#endif
-
-// ReSharper disable InconsistentNaming
-namespace SixLabors.ImageSharp.Formats.Jpeg.Components
-{
- ///
- /// Contains inaccurate, but fast forward and inverse DCT implementations.
- ///
- internal static partial class FastFloatingPointDCT
- {
- ///
- /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
- /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
- ///
- /// Source
- /// Destination
- /// Temporary block provided by the caller
- public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
- {
- src.TransposeInto(ref temp);
-
- IDCT8x8(ref temp, ref dest);
- dest.TransposeInto(ref temp);
- IDCT8x8(ref temp, ref dest);
-
- // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
- dest.MultiplyInPlace(C_0_125);
- }
-
- ///
- /// Performs 8x8 matrix Inverse Discrete Cosine Transform
- ///
- /// Source
- /// Destination
- public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (Avx.IsSupported)
- {
- IDCT8x8_Avx(ref s, ref d);
- }
- else
-#endif
- {
- IDCT8x4_LeftPart(ref s, ref d);
- IDCT8x4_RightPart(ref s, ref d);
- }
- }
-
- ///
- /// Do IDCT internal operations on the left part of the block. Original src:
- /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
- ///
- /// The source block
- /// Destination block
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
- {
- Vector4 my1 = s.V1L;
- Vector4 my7 = s.V7L;
- Vector4 mz0 = my1 + my7;
-
- Vector4 my3 = s.V3L;
- Vector4 mz2 = my3 + my7;
- Vector4 my5 = s.V5L;
- Vector4 mz1 = my3 + my5;
- Vector4 mz3 = my1 + my5;
-
- Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
- mz2 = (mz2 * C_1_961571) + mz4;
- mz3 = (mz3 * C_0_390181) + mz4;
- mz0 = mz0 * C_0_899976;
- mz1 = mz1 * C_2_562915;
-
- Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
- Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
- Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
- Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
- Vector4 my2 = s.V2L;
- Vector4 my6 = s.V6L;
- mz4 = (my2 + my6) * C_0_541196;
- Vector4 my0 = s.V0L;
- Vector4 my4 = s.V4L;
- mz0 = my0 + my4;
- mz1 = my0 - my4;
-
- mz2 = mz4 + (my6 * C_1_847759);
- mz3 = mz4 + (my2 * C_0_765367);
-
- my0 = mz0 + mz3;
- my3 = mz0 - mz3;
- my1 = mz1 + mz2;
- my2 = mz1 - mz2;
-
- d.V0L = my0 + mb0;
- d.V7L = my0 - mb0;
- d.V1L = my1 + mb1;
- d.V6L = my1 - mb1;
- d.V2L = my2 + mb2;
- d.V5L = my2 - mb2;
- d.V3L = my3 + mb3;
- d.V4L = my3 - mb3;
- }
-
- ///
- /// Do IDCT internal operations on the right part of the block.
- /// Original src:
- /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
- ///
- /// The source block
- /// The destination block
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
- {
- Vector4 my1 = s.V1R;
- Vector4 my7 = s.V7R;
- Vector4 mz0 = my1 + my7;
-
- Vector4 my3 = s.V3R;
- Vector4 mz2 = my3 + my7;
- Vector4 my5 = s.V5R;
- Vector4 mz1 = my3 + my5;
- Vector4 mz3 = my1 + my5;
-
- Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
- mz2 = (mz2 * C_1_961571) + mz4;
- mz3 = (mz3 * C_0_390181) + mz4;
- mz0 = mz0 * C_0_899976;
- mz1 = mz1 * C_2_562915;
-
- Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
- Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
- Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
- Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
- Vector4 my2 = s.V2R;
- Vector4 my6 = s.V6R;
- mz4 = (my2 + my6) * C_0_541196;
- Vector4 my0 = s.V0R;
- Vector4 my4 = s.V4R;
- mz0 = my0 + my4;
- mz1 = my0 - my4;
-
- mz2 = mz4 + (my6 * C_1_847759);
- mz3 = mz4 + (my2 * C_0_765367);
-
- my0 = mz0 + mz3;
- my3 = mz0 - mz3;
- my1 = mz1 + mz2;
- my2 = mz1 - mz2;
-
- d.V0R = my0 + mb0;
- d.V7R = my0 - mb0;
- d.V1R = my1 + mb1;
- d.V6R = my1 - mb1;
- d.V2R = my2 + mb2;
- d.V5R = my2 - mb2;
- d.V3R = my3 + mb3;
- d.V4R = my3 - mb3;
- }
-
- ///
- /// Combined operation of and
- /// using AVX commands.
- ///
- /// Source
- /// Destination
- public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
- Vector256 my1 = s.V1;
- Vector256 my7 = s.V7;
- Vector256 mz0 = Avx.Add(my1, my7);
-
- Vector256 my3 = s.V3;
- Vector256 mz2 = Avx.Add(my3, my7);
- Vector256 my5 = s.V5;
- Vector256 mz1 = Avx.Add(my3, my5);
- Vector256 mz3 = Avx.Add(my1, my5);
-
- Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758);
-
- mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615);
- mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901);
- mz0 = Avx.Multiply(mz0, C_V_n0_8999);
- mz1 = Avx.Multiply(mz1, C_V_n2_5629);
-
- Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2);
- Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3);
- Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2);
- Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3);
-
-
- Vector256 my2 = s.V2;
- Vector256 my6 = s.V6;
- mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411);
- Vector256 my0 = s.V0;
- Vector256 my4 = s.V4;
- mz0 = Avx.Add(my0, my4);
- mz1 = Avx.Subtract(my0, my4);
- mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477);
- mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653);
-
- my0 = Avx.Add(mz0, mz3);
- my3 = Avx.Subtract(mz0, mz3);
- my1 = Avx.Add(mz1, mz2);
- my2 = Avx.Subtract(mz1, mz2);
-
- d.V0 = Avx.Add(my0, mb0);
- d.V7 = Avx.Subtract(my0, mb0);
- d.V1 = Avx.Add(my1, mb1);
- d.V6 = Avx.Subtract(my1, mb1);
- d.V2 = Avx.Add(my2, mb2);
- d.V5 = Avx.Subtract(my2, mb2);
- d.V3 = Avx.Add(my3, mb3);
- d.V4 = Avx.Subtract(my3, mb3);
-#endif
- }
- }
-}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 493c0a6880..d7101abfd0 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -319,5 +319,219 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
dest.MultiplyInPlace(C_0_125);
}
+
+ /// <summary>
+ /// Performs 8x8 matrix Inverse Discrete Cosine Transform, using the AVX kernel when
+ /// it is available and the two Vector4 half-block kernels otherwise.
+ /// </summary>
+ /// <param name="s">Source</param>
+ /// <param name="d">Destination</param>
+ public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+     if (Avx.IsSupported)
+     {
+         IDCT8x8_Avx(ref s, ref d);
+         return;
+     }
+#endif
+     // Fallback: process the left half of the block, then the right half.
+     IDCT8x4_LeftPart(ref s, ref d);
+     IDCT8x4_RightPart(ref s, ref d);
+ }
+
+ /// <summary>
+ /// Applies the IDCT kernel to the left part of the block (the V0L..V7L vectors). Original src:
+ /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+ /// </summary>
+ /// <param name="s">The source block</param>
+ /// <param name="d">Destination block</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
+ {
+     // Read all eight input vectors first; every read of s happens before any write to d.
+     Vector4 in0 = s.V0L;
+     Vector4 in1 = s.V1L;
+     Vector4 in2 = s.V2L;
+     Vector4 in3 = s.V3L;
+     Vector4 in4 = s.V4L;
+     Vector4 in5 = s.V5L;
+     Vector4 in6 = s.V6L;
+     Vector4 in7 = s.V7L;
+
+     // Odd part: built only from inputs 1, 3, 5 and 7.
+     Vector4 sum17 = in1 + in7;
+     Vector4 sum35 = in3 + in5;
+     Vector4 sum37 = in3 + in7;
+     Vector4 sum15 = in1 + in5;
+     Vector4 oddShared = (sum17 + sum35) * C_1_175876;
+     Vector4 term37 = (sum37 * C_1_961571) + oddShared;
+     Vector4 term15 = (sum15 * C_0_390181) + oddShared;
+     Vector4 term17 = sum17 * C_0_899976;
+     Vector4 term35 = sum35 * C_2_562915;
+
+     Vector4 odd0 = (in1 * C_1_501321) + term17 + term15;
+     Vector4 odd1 = (in3 * C_3_072711) + term35 + term37;
+     Vector4 odd2 = (in5 * C_2_053120) + term35 + term15;
+     Vector4 odd3 = (in7 * C_0_298631) + term17 + term37;
+
+     // Even part: built only from inputs 0, 2, 4 and 6.
+     Vector4 evenShared = (in2 + in6) * C_0_541196;
+     Vector4 term6 = evenShared + (in6 * C_1_847759);
+     Vector4 term2 = evenShared + (in2 * C_0_765367);
+     Vector4 sum04 = in0 + in4;
+     Vector4 diff04 = in0 - in4;
+     Vector4 even0 = sum04 + term2;
+     Vector4 even1 = diff04 + term6;
+     Vector4 even2 = diff04 - term6;
+     Vector4 even3 = sum04 - term2;
+
+     // Outputs k and 7 - k are the sum and the difference of the matching even and odd terms.
+     d.V0L = even0 + odd0;
+     d.V1L = even1 + odd1;
+     d.V2L = even2 + odd2;
+     d.V3L = even3 + odd3;
+     d.V4L = even3 - odd3;
+     d.V5L = even2 - odd2;
+     d.V6L = even1 - odd1;
+     d.V7L = even0 - odd0;
+ }
+
+ /// <summary>
+ /// Applies the IDCT kernel to the right part of the block (the V0R..V7R vectors).
+ /// Original src:
+ /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+ /// </summary>
+ /// <param name="s">The source block</param>
+ /// <param name="d">The destination block</param>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
+ {
+     // Read all eight input vectors first; every read of s happens before any write to d.
+     Vector4 in0 = s.V0R;
+     Vector4 in1 = s.V1R;
+     Vector4 in2 = s.V2R;
+     Vector4 in3 = s.V3R;
+     Vector4 in4 = s.V4R;
+     Vector4 in5 = s.V5R;
+     Vector4 in6 = s.V6R;
+     Vector4 in7 = s.V7R;
+
+     // Odd part: built only from inputs 1, 3, 5 and 7.
+     Vector4 sum17 = in1 + in7;
+     Vector4 sum35 = in3 + in5;
+     Vector4 sum37 = in3 + in7;
+     Vector4 sum15 = in1 + in5;
+     Vector4 oddShared = (sum17 + sum35) * C_1_175876;
+     Vector4 term37 = (sum37 * C_1_961571) + oddShared;
+     Vector4 term15 = (sum15 * C_0_390181) + oddShared;
+     Vector4 term17 = sum17 * C_0_899976;
+     Vector4 term35 = sum35 * C_2_562915;
+
+     Vector4 odd0 = (in1 * C_1_501321) + term17 + term15;
+     Vector4 odd1 = (in3 * C_3_072711) + term35 + term37;
+     Vector4 odd2 = (in5 * C_2_053120) + term35 + term15;
+     Vector4 odd3 = (in7 * C_0_298631) + term17 + term37;
+
+     // Even part: built only from inputs 0, 2, 4 and 6.
+     Vector4 evenShared = (in2 + in6) * C_0_541196;
+     Vector4 term6 = evenShared + (in6 * C_1_847759);
+     Vector4 term2 = evenShared + (in2 * C_0_765367);
+     Vector4 sum04 = in0 + in4;
+     Vector4 diff04 = in0 - in4;
+     Vector4 even0 = sum04 + term2;
+     Vector4 even1 = diff04 + term6;
+     Vector4 even2 = diff04 - term6;
+     Vector4 even3 = sum04 - term2;
+
+     // Outputs k and 7 - k are the sum and the difference of the matching even and odd terms.
+     d.V0R = even0 + odd0;
+     d.V1R = even1 + odd1;
+     d.V2R = even2 + odd2;
+     d.V3R = even3 + odd3;
+     d.V4R = even3 - odd3;
+     d.V5R = even2 - odd2;
+     d.V6R = even1 - odd1;
+     d.V7R = even0 - odd0;
+ }
+
+ /// <summary>
+ /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
+ /// using AVX commands. Compiles to an empty body (d is left untouched) when SUPPORTS_RUNTIME_INTRINSICS is not defined.
+ /// </summary>
+ /// <param name="s">Source</param>
+ /// <param name="d">Destination</param>
+ public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+     Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
+
+     // NOTE(review): assumes Block8x8F.V0..V7 are Vector256<float> rows - confirm against Block8x8F.
+     Vector256<float> my1 = s.V1;
+     Vector256<float> my7 = s.V7;
+     Vector256<float> mz0 = Avx.Add(my1, my7);
+     Vector256<float> my3 = s.V3;
+     Vector256<float> mz2 = Avx.Add(my3, my7);
+     Vector256<float> my5 = s.V5;
+     Vector256<float> mz1 = Avx.Add(my3, my5);
+     Vector256<float> mz3 = Avx.Add(my1, my5);
+
+     Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758);
+     // MultiplyAdd(a, b, c) is presumably a + (b * c), mirroring the Vector4 kernels - confirm in SimdUtils.
+     mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615);
+     mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901);
+     mz0 = Avx.Multiply(mz0, C_V_n0_8999);
+     mz1 = Avx.Multiply(mz1, C_V_n2_5629);
+
+     Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2);
+     Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3);
+     Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2);
+     Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3);
+
+     // Second half works on rows 0, 2, 4 and 6; mz0..mz4 are reused as scratch from here on.
+     Vector256<float> my2 = s.V2;
+     Vector256<float> my6 = s.V6;
+     mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411);
+     Vector256<float> my0 = s.V0;
+     Vector256<float> my4 = s.V4;
+     mz0 = Avx.Add(my0, my4);
+     mz1 = Avx.Subtract(my0, my4);
+     mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477);
+     mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653);
+
+     my0 = Avx.Add(mz0, mz3);
+     my3 = Avx.Subtract(mz0, mz3);
+     my1 = Avx.Add(mz1, mz2);
+     my2 = Avx.Subtract(mz1, mz2);
+
+     d.V0 = Avx.Add(my0, mb0);
+     d.V7 = Avx.Subtract(my0, mb0);
+     d.V1 = Avx.Add(my1, mb1);
+     d.V6 = Avx.Subtract(my1, mb1);
+     d.V2 = Avx.Add(my2, mb2);
+     d.V5 = Avx.Subtract(my2, mb2);
+     d.V3 = Avx.Add(my3, mb3);
+     d.V4 = Avx.Subtract(my3, mb3);
+#endif
+ }
+
+ /// <summary>
+ /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
+ /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
+ /// </summary>
+ /// <param name="src">Source</param>
+ /// <param name="dest">Destination</param>
+ /// <param name="temp">Temporary block provided by the caller; it is the target of both transposes</param>
+ public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
+ {
+ src.TransposeInto(ref temp);
+
+ IDCT8x8(ref temp, ref dest); // first pass: temp -> dest
+ dest.TransposeInto(ref temp); // transpose the intermediate result back into temp for the second pass
+ IDCT8x8(ref temp, ref dest); // second pass: temp -> dest
+
+ // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
+ dest.MultiplyInPlace(C_0_125);
+ }
}
}