From fb038aaf3c6af75ecedecee38ab11dedc2655881 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 10 Sep 2021 06:42:03 +0300 Subject: [PATCH] Tidied up DCT code --- .../Formats/Jpeg/Components/Block8x8F.cs | 109 ++++---- .../FastFloatingPointDCT.Intrinsic.cs | 230 +++++++++++++++- .../Jpeg/Components/FastFloatingPointDCT.cs | 247 ++---------------- 3 files changed, 284 insertions(+), 302 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 0b78735852..a25c572aec 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -450,21 +450,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components a.V7R *= b.V7R; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor) - { - var neg = new Vector4(-1); - var add = new Vector4(.5F); - - // sign(dividend) = max(min(dividend, 1), -1) - Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One); - - // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend) - // TODO: This is wrong but I have no idea how to fix it without if-else operator - // sign here is a value in range [-1..1], it can be equal to -0.2 for example which is wrong - return (dividend / divisor) + (sign * add); - } - public void RoundInto(ref Block8x8 dest) { for (int i = 0; i < Size; i++) @@ -562,6 +547,47 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components Unsafe.Add(ref dRef, 7) = bottom; } + /// + /// Compares entire 8x8 block to a single scalar value. + /// + /// Value to compare to. + public bool EqualsToScalar(int value) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); + + var targetVector = Vector256.Create(value); + ref Vector256 blockStride = ref this.V0; + + for (int i = 0; i < RowCount; i++) + { + Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector); + if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask) + { + return false; + } + } + + return true; + } +#endif + { + ref float scalars = ref Unsafe.As(ref this); + + for (int i = 0; i < Size; i++) + { + if ((int)Unsafe.Add(ref scalars, i) != value) + { + return false; + } + } + + return true; + } + } + /// public bool Equals(Block8x8F other) => this.V0L == other.V0L @@ -598,15 +624,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components return sb.ToString(); } - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector NormalizeAndRound(Vector row, Vector off, Vector max) - { - row += off; - row = Vector.Max(row, Vector.Zero); - row = Vector.Min(row, max); - return row.FastRound(); - } - /// /// Transpose the block inplace. /// @@ -650,45 +667,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } } - /// - /// Compares entire 8x8 block to a single scalar value. - /// - /// Value to compare to. - public bool EqualsToScalar(int value) + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector NormalizeAndRound(Vector row, Vector off, Vector max) { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) - { - const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); - - var targetVector = Vector256.Create(value); - ref Vector256 blockStride = ref this.V0; - - for (int i = 0; i < RowCount; i++) - { - Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector); - if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask) - { - return false; - } - } - - return true; - } -#endif - { - ref float scalars = ref Unsafe.As(ref this); - - for (int i = 0; i < Size; i++) - { - if ((int)Unsafe.Add(ref scalars, i) != value) - { - return false; - } - } - - return true; - } + row += off; + row = Vector.Max(row, Vector.Zero); + row = Vector.Min(row, max); + return row.FastRound(); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index eb60445d3f..acc83e2799 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -14,6 +14,30 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { internal static partial class FastFloatingPointDCT { +#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings + private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); + private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f); + private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); + private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); + + private static readonly Vector128 mm128_F_0_7071 = Vector128.Create(0.707106781f); + private static readonly Vector128 mm128_F_0_3826 = Vector128.Create(0.382683433f); + private static readonly Vector128 mm128_F_0_5411 = Vector128.Create(0.541196100f); + private static readonly Vector128 mm128_F_1_3065 = Vector128.Create(1.306562965f); + + private static readonly Vector256 mm256_F_1_1758 = Vector256.Create(1.175876f); + private static readonly Vector256 mm256_F_n1_9615 = Vector256.Create(-1.961570560f); + private static readonly Vector256 mm256_F_n0_3901 = Vector256.Create(-0.390180644f); + private static readonly Vector256 mm256_F_n0_8999 = Vector256.Create(-0.899976223f); + private static readonly Vector256 mm256_F_n2_5629 = Vector256.Create(-2.562915447f); + private static readonly Vector256 mm256_F_0_2986 = Vector256.Create(0.298631336f); + private static readonly Vector256 mm256_F_2_0531 = Vector256.Create(2.053119869f); + private static readonly Vector256 mm256_F_3_0727 = Vector256.Create(3.072711026f); + private static readonly Vector256 mm256_F_1_5013 = Vector256.Create(1.501321110f); + private static readonly Vector256 mm256_F_n1_8477 = Vector256.Create(-1.847759065f); + private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f); +#pragma warning restore SA1310, SA1311, IDE1006 + /// /// Gets reciprocal coefficients for jpeg quantization tables calculation. /// @@ -50,18 +74,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, }; -#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings - private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); - private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f); - private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); - private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); - - private static readonly Vector128 mm128_F_0_7071 = Vector128.Create(0.707106781f); - private static readonly Vector128 mm128_F_0_3826 = Vector128.Create(0.382683433f); - private static readonly Vector128 mm128_F_0_5411 = Vector128.Create(0.541196100f); - private static readonly Vector128 mm128_F_1_3065 = Vector128.Create(1.306562965f); -#pragma warning restore SA1310, SA1311, IDE1006 - /// /// Apply floating point FDCT inplace using simd operations. /// @@ -205,6 +217,200 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components block.V1 = Avx.Add(z11, z4); block.V7 = Avx.Subtract(z11, z4); } + + /// + /// Performs 8x8 matrix Inverse Discrete Cosine Transform + /// + /// Source + /// Destination + public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported) + { + IDCT8x8_Avx(ref s, ref d); + } + else +#endif + { + IDCT8x4_LeftPart(ref s, ref d); + IDCT8x4_RightPart(ref s, ref d); + } + } + + /// + /// Do IDCT internal operations on the left part of the block. Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// Destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1L; + Vector4 my7 = s.V7L; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3L; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5L; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2L; + Vector4 my6 = s.V6L; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0L; + Vector4 my4 = s.V4L; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0L = my0 + mb0; + d.V7L = my0 - mb0; + d.V1L = my1 + mb1; + d.V6L = my1 - mb1; + d.V2L = my2 + mb2; + d.V5L = my2 - mb2; + d.V3L = my3 + mb3; + d.V4L = my3 - mb3; + } + + /// + /// Do IDCT internal operations on the right part of the block. + /// Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// The destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1R; + Vector4 my7 = s.V7R; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3R; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5R; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2R; + Vector4 my6 = s.V6R; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0R; + Vector4 my4 = s.V4R; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0R = my0 + mb0; + d.V7R = my0 - mb0; + d.V1R = my1 + mb1; + d.V6R = my1 - mb1; + d.V2R = my2 + mb2; + d.V5R = my2 - mb2; + d.V3R = my3 + mb3; + d.V4R = my3 - mb3; + } + + /// + /// Combined operation of and + /// using AVX commands. + /// + /// Source + /// Destination + public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); + + Vector256 my1 = s.V1; + Vector256 my7 = s.V7; + Vector256 mz0 = Avx.Add(my1, my7); + + Vector256 my3 = s.V3; + Vector256 mz2 = Avx.Add(my3, my7); + Vector256 my5 = s.V5; + Vector256 mz1 = Avx.Add(my3, my5); + Vector256 mz3 = Avx.Add(my1, my5); + + Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758); + + mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615); + mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901); + mz0 = Avx.Multiply(mz0, mm256_F_n0_8999); + mz1 = Avx.Multiply(mz1, mm256_F_n2_5629); + + Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2); + Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3); + Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2); + Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3); + + Vector256 my2 = s.V2; + Vector256 my6 = s.V6; + mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411); + Vector256 my0 = s.V0; + Vector256 my4 = s.V4; + mz0 = Avx.Add(my0, my4); + mz1 = Avx.Subtract(my0, my4); + mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477); + mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653); + + my0 = Avx.Add(mz0, mz3); + my3 = Avx.Subtract(mz0, mz3); + my1 = Avx.Add(mz1, mz2); + my2 = Avx.Subtract(mz1, mz2); + + d.V0 = Avx.Add(my0, mb0); + d.V7 = Avx.Subtract(my0, mb0); + d.V1 = Avx.Add(my1, mb1); + d.V6 = Avx.Subtract(my1, mb1); + d.V2 = Avx.Add(my2, mb2); + d.V5 = Avx.Subtract(my2, mb2); + d.V3 = Avx.Add(my3, mb3); + d.V4 = Avx.Subtract(my3, mb3); +#endif + } } } #endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index a554e8577b..181f18185b 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -43,216 +43,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private const float C_0_765367 = 0.765366865f; private const float C_0_125 = 0.1250f; - -#if SUPPORTS_RUNTIME_INTRINSICS - private static readonly Vector256 C_V_0_5411 = Vector256.Create(0.541196f); - private static readonly Vector256 C_V_1_1758 = Vector256.Create(1.175876f); - - private static readonly Vector256 C_V_n1_9615 = Vector256.Create(-1.961570560f); - private static readonly Vector256 C_V_n0_3901 = Vector256.Create(-0.390180644f); - private static readonly Vector256 C_V_n0_8999 = Vector256.Create(-0.899976223f); - private static readonly Vector256 C_V_n2_5629 = Vector256.Create(-2.562915447f); - private static readonly Vector256 C_V_0_2986 = Vector256.Create(0.298631336f); - private static readonly Vector256 C_V_2_0531 = Vector256.Create(2.053119869f); - private static readonly Vector256 C_V_3_0727 = Vector256.Create(3.072711026f); - private static readonly Vector256 C_V_1_5013 = Vector256.Create(1.501321110f); - private static readonly Vector256 C_V_n1_8477 = Vector256.Create(-1.847759065f); - private static readonly Vector256 C_V_0_7653 = Vector256.Create(0.765366865f); -#endif #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore - /// - /// Performs 8x8 matrix Inverse Discrete Cosine Transform - /// - /// Source - /// Destination - public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) - { - IDCT8x8_Avx(ref s, ref d); - } - else -#endif - { - IDCT8x4_LeftPart(ref s, ref d); - IDCT8x4_RightPart(ref s, ref d); - } - } - - /// - /// Do IDCT internal operations on the left part of the block. Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// Destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1L; - Vector4 my7 = s.V7L; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3L; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5L; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2L; - Vector4 my6 = s.V6L; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0L; - Vector4 my4 = s.V4L; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0L = my0 + mb0; - d.V7L = my0 - mb0; - d.V1L = my1 + mb1; - d.V6L = my1 - mb1; - d.V2L = my2 + mb2; - d.V5L = my2 - mb2; - d.V3L = my3 + mb3; - d.V4L = my3 - mb3; - } - - /// - /// Do IDCT internal operations on the right part of the block. - /// Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// The destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1R; - Vector4 my7 = s.V7R; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3R; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5R; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2R; - Vector4 my6 = s.V6R; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0R; - Vector4 my4 = s.V4R; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0R = my0 + mb0; - d.V7R = my0 - mb0; - d.V1R = my1 + mb1; - d.V6R = my1 - mb1; - d.V2R = my2 + mb2; - d.V5R = my2 - mb2; - d.V3R = my3 + mb3; - d.V4R = my3 - mb3; - } - - /// - /// Combined operation of and - /// using AVX commands. - /// - /// Source - /// Destination - public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); - - Vector256 my1 = s.V1; - Vector256 my7 = s.V7; - Vector256 mz0 = Avx.Add(my1, my7); - - Vector256 my3 = s.V3; - Vector256 mz2 = Avx.Add(my3, my7); - Vector256 my5 = s.V5; - Vector256 mz1 = Avx.Add(my3, my5); - Vector256 mz3 = Avx.Add(my1, my5); - - Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758); - - mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615); - mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901); - mz0 = Avx.Multiply(mz0, C_V_n0_8999); - mz1 = Avx.Multiply(mz1, C_V_n2_5629); - - Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2); - Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3); - Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2); - Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3); - - Vector256 my2 = s.V2; - Vector256 my6 = s.V6; - mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411); - Vector256 my0 = s.V0; - Vector256 my4 = s.V4; - mz0 = Avx.Add(my0, my4); - mz1 = Avx.Subtract(my0, my4); - mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477); - mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653); - - my0 = Avx.Add(mz0, mz3); - my3 = Avx.Subtract(mz0, mz3); - my1 = Avx.Add(mz1, mz2); - my2 = Avx.Subtract(mz1, mz2); - - d.V0 = Avx.Add(my0, mb0); - d.V7 = Avx.Subtract(my0, mb0); - d.V1 = Avx.Add(my1, mb1); - d.V6 = Avx.Subtract(my1, mb1); - d.V2 = Avx.Add(my2, mb2); - d.V5 = Avx.Subtract(my2, mb2); - d.V3 = Avx.Add(my3, mb3); - d.V4 = Avx.Subtract(my3, mb3); -#endif - } /// /// Apply floating point IDCT inplace. @@ -267,10 +58,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components temp.Transpose(); IDCT8x8(ref temp, ref block); - // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? + // TODO: This can be fused into quantization table step block.MultiplyInPlace(C_0_125); } + /// + /// Apply 2D floating point FDCT inplace. + /// + /// Input matrix. + public static void TransformFDCT(ref Block8x8F block) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported || Sse.IsSupported) + { + ForwardTransformSimd(ref block); + } + else +#endif + { + ForwardTransformScalar(ref block); + } + } + /// /// Apply 2D floating point FDCT inplace using scalar operations. /// @@ -380,23 +189,5 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components dataRef = ref Unsafe.Add(ref dataRef, 1); } } - - /// - /// Apply 2D floating point FDCT inplace. - /// - /// Input matrix. - public static void TransformFDCT(ref Block8x8F block) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported || Sse.IsSupported) - { - ForwardTransformSimd(ref block); - } - else -#endif - { - ForwardTransformScalar(ref block); - } - } } }