From bd9f06f42be1d11df0b5080b04e52e577935aa26 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 28 Sep 2021 23:20:03 +0300 Subject: [PATCH] FDCT sse path via Vector4 --- .../FastFloatingPointDCT.Intrinsic.cs | 88 +---------- .../Jpeg/Components/FastFloatingPointDCT.cs | 142 ++++++++++++++---- 2 files changed, 114 insertions(+), 116 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index 7d92c34682..f40ae6e874 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -18,11 +18,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); - private static readonly Vector128 mm128_F_0_7071 = Vector128.Create(0.707106781f); - private static readonly Vector128 mm128_F_0_3826 = Vector128.Create(0.382683433f); - private static readonly Vector128 mm128_F_0_5411 = Vector128.Create(0.541196100f); - private static readonly Vector128 mm128_F_1_3065 = Vector128.Create(1.306562965f); - private static readonly Vector256 mm256_F_1_1758 = Vector256.Create(1.175876f); private static readonly Vector256 mm256_F_n1_9615 = Vector256.Create(-1.961570560f); private static readonly Vector256 mm256_F_n0_3901 = Vector256.Create(-0.390180644f); @@ -40,92 +35,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Apply floating point FDCT inplace using simd operations. /// /// Input matrix. 
- private static void ForwardTransformSimd(ref Block8x8F block) + private static void ForwardTransform_Avx(ref Block8x8F block) { - DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation."); + DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); // First pass - process rows block.TransposeInplace(); - if (Avx.IsSupported) - { - FDCT8x8_Avx(ref block); - } - else - { - // Left part - FDCT8x4_Sse(ref Unsafe.As>(ref block.V0L)); - - // Right part - FDCT8x4_Sse(ref Unsafe.As>(ref block.V0R)); - } + FDCT8x8_Avx(ref block); // Second pass - process columns block.TransposeInplace(); - if (Avx.IsSupported) - { - FDCT8x8_Avx(ref block); - } - else - { - // Left part - FDCT8x4_Sse(ref Unsafe.As>(ref block.V0L)); - - // Right part - FDCT8x4_Sse(ref Unsafe.As>(ref block.V0R)); - } - } - - /// - /// Apply 1D floating point FDCT inplace using SSE operations on 8x4 part of 8x8 matrix. - /// - /// - /// Requires Sse support. - /// Must be called on both 8x4 matrix parts for the full FDCT transform. 
- /// - /// Input reference to the first - public static void FDCT8x4_Sse(ref Vector128 blockRef) - { - DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation."); - - Vector128 tmp0 = Sse.Add(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14)); - Vector128 tmp7 = Sse.Subtract(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14)); - Vector128 tmp1 = Sse.Add(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12)); - Vector128 tmp6 = Sse.Subtract(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12)); - Vector128 tmp2 = Sse.Add(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10)); - Vector128 tmp5 = Sse.Subtract(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10)); - Vector128 tmp3 = Sse.Add(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8)); - Vector128 tmp4 = Sse.Subtract(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8)); - - // Even part - Vector128 tmp10 = Sse.Add(tmp0, tmp3); - Vector128 tmp13 = Sse.Subtract(tmp0, tmp3); - Vector128 tmp11 = Sse.Add(tmp1, tmp2); - Vector128 tmp12 = Sse.Subtract(tmp1, tmp2); - - Unsafe.Add(ref blockRef, 0) = Sse.Add(tmp10, tmp11); - Unsafe.Add(ref blockRef, 8) = Sse.Subtract(tmp10, tmp11); - - Vector128 z1 = Sse.Multiply(Sse.Add(tmp12, tmp13), mm128_F_0_7071); - Unsafe.Add(ref blockRef, 4) = Sse.Add(tmp13, z1); - Unsafe.Add(ref blockRef, 12) = Sse.Subtract(tmp13, z1); - - // Odd part - tmp10 = Sse.Add(tmp4, tmp5); - tmp11 = Sse.Add(tmp5, tmp6); - tmp12 = Sse.Add(tmp6, tmp7); - - Vector128 z5 = Sse.Multiply(Sse.Subtract(tmp10, tmp12), mm128_F_0_3826); - Vector128 z2 = Sse.Add(Sse.Multiply(mm128_F_0_5411, tmp10), z5); - Vector128 z4 = Sse.Add(Sse.Multiply(mm128_F_1_3065, tmp12), z5); - Vector128 z3 = Sse.Multiply(tmp11, mm128_F_0_7071); - - Vector128 z11 = Sse.Add(tmp7, z3); - Vector128 z13 = Sse.Subtract(tmp7, z3); - - Unsafe.Add(ref blockRef, 10) = Sse.Add(z13, z2); - Unsafe.Add(ref blockRef, 6) = Sse.Subtract(z13, z2); - Unsafe.Add(ref blockRef, 2) = 
Sse.Add(z11, z4); - Unsafe.Add(ref blockRef, 14) = Sse.Subtract(z11, z4); + FDCT8x8_Avx(ref block); } /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 985dac1bd8..43f6b7a1fc 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -18,30 +18,27 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { #pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore private const float C_1_175876 = 1.175875602f; - private const float C_1_961571 = -1.961570560f; - private const float C_0_390181 = -0.390180644f; - private const float C_0_899976 = -0.899976223f; - private const float C_2_562915 = -2.562915447f; - private const float C_0_298631 = 0.298631336f; - private const float C_2_053120 = 2.053119869f; - private const float C_3_072711 = 3.072711026f; - private const float C_1_501321 = 1.501321110f; - private const float C_0_541196 = 0.541196100f; - private const float C_1_847759 = -1.847759065f; - private const float C_0_765367 = 0.765366865f; private const float C_0_125 = 0.1250f; + +#pragma warning disable SA1311, IDE1006 // naming rules violation warnings + private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f); + private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f); + private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f); + private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f); +#pragma warning restore SA1311, IDE1006 + #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore /// @@ -80,23 +77,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, }; - /// - /// Apply floating point IDCT inplace. 
- /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. - /// - /// Input matrix. - /// Matrix to store temporal results. - public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) - { - block.TransposeInplace(); - IDCT8x8(ref block, ref temp); - temp.TransposeInplace(); - IDCT8x8(ref temp, ref block); - - // TODO: This can be fused into quantization table step - block.MultiplyInPlace(C_0_125); - } - /// /// Apply 2D floating point FDCT inplace. /// @@ -104,14 +84,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components public static void TransformFDCT(ref Block8x8F block) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse.IsSupported) + if (Avx.IsSupported) { - ForwardTransformSimd(ref block); + ForwardTransform_Avx(ref block); } else #endif + if (Vector.IsHardwareAccelerated) { - ForwardTransformScalar(ref block); + ForwardTransform_Vector4(ref block); + } + else + { + ForwardTransform_Scalar(ref block); } } @@ -122,7 +107,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c. /// /// Input matrix. - private static void ForwardTransformScalar(ref Block8x8F block) + private static void ForwardTransform_Scalar(ref Block8x8F block) { const int dctSize = 8; @@ -225,6 +210,99 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } } + /// + /// Apply floating point FDCT inplace using the Vector4 API. + /// + /// + /// This implementation must be called only if hardware supports vectors of 4 + /// floating point numbers. Otherwise explicit scalar + /// implementation is faster + /// because it does not rely on matrix transposition. + /// + /// Input matrix.
+ private static void ForwardTransform_Vector4(ref Block8x8F block) + { + DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); + + // First pass - process rows + block.TransposeInplace(); + FDCT8x4_Vector4(ref block.V0L); + FDCT8x4_Vector4(ref block.V0R); + + // Second pass - process columns + block.TransposeInplace(); + FDCT8x4_Vector4(ref block.V0L); + FDCT8x4_Vector4(ref block.V0R); + } + + /// + /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix. + /// + /// + /// Implemented using Vector4 API operations for either scalar or SSE hardware implementation. + /// Must be called on both 8x4 matrix parts for the full FDCT transform. + /// + /// Input reference to the first Vector4 of the 8x4 matrix part. + private static void FDCT8x4_Vector4(ref Vector4 blockRef) + { + Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14); + Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14); + Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12); + Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12); + Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10); + Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10); + Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8); + Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8); + + // Even part + Vector4 tmp10 = tmp0 + tmp3; + Vector4 tmp13 = tmp0 - tmp3; + Vector4 tmp11 = tmp1 + tmp2; + Vector4 tmp12 = tmp1 - tmp2; + + Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11; + Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11; + + Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071; + Unsafe.Add(ref blockRef, 4) = tmp13 + z1; + Unsafe.Add(ref blockRef, 12) = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826; + Vector4 z2 = (mm128_F_0_5411 * tmp10) +
z5; + Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5; + Vector4 z3 = tmp11 * mm128_F_0_7071; + + Vector4 z11 = tmp7 + z3; + Vector4 z13 = tmp7 - z3; + + Unsafe.Add(ref blockRef, 10) = z13 + z2; + Unsafe.Add(ref blockRef, 6) = z13 - z2; + Unsafe.Add(ref blockRef, 2) = z11 + z4; + Unsafe.Add(ref blockRef, 14) = z11 - z4; + } + + /// + /// Apply floating point IDCT inplace. + /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. + /// + /// Input matrix. + /// Matrix to store temporary results. + public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) + { + block.TransposeInplace(); + IDCT8x8(ref block, ref temp); + temp.TransposeInplace(); + IDCT8x8(ref temp, ref block); + + // TODO: This can be fused into quantization table step + block.MultiplyInPlace(C_0_125); + } + /// /// Performs 8x8 matrix Inverse Discrete Cosine Transform ///