From f297fce021ef03e988d7c61c5641e78bcdb895bd Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 10 Sep 2021 12:27:35 +0300 Subject: [PATCH] Tidied up the code, added benchmarks --- .../Formats/Jpeg/Components/Block8x8.cs | 2 + .../Jpeg/Components/Block8x8F.Intrinsic.cs | 29 +- .../Components/Encoder/HuffmanScanEncoder.cs | 8 +- .../FastFloatingPointDCT.Intrinsic.cs | 172 ----------- .../Jpeg/Components/FastFloatingPointDCT.cs | 173 +++++++++++ .../Jpeg/Components/ZigZag.Intrinsic.cs | 290 +++++++----------- .../BlockOperations/Block8x8F_Quantize.cs | 31 +- .../BlockOperations/Block8x8F_Transpose.cs | 14 + .../Config.HwIntrinsics.cs | 10 +- 9 files changed, 352 insertions(+), 377 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index c76eb942f..71077675d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -5,8 +5,10 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +#endif using System.Text; namespace SixLabors.ImageSharp.Formats.Jpeg.Components diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index 83227ff07..733d32892 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -35,33 +35,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components [FieldOffset(224)] public Vector256 V7; - private static ReadOnlySpan DivideIntoInt16_Avx2_ShuffleMask => new int[] { - 0, 1, 4, 5, 2, 3, 6, 7 - }; + private static readonly Vector256 MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); - fixed (int* maskPtr = DivideIntoInt16_Avx2_ShuffleMask) - { - Vector256 crossLaneShuffleMask = Avx.LoadVector256(maskPtr).AsInt32(); - - ref Vector256 aBase = ref Unsafe.As>(ref a); - ref Vector256 bBase = ref Unsafe.As>(ref b); + ref Vector256 aBase = ref a.V0; + ref Vector256 bBase = ref b.V0; - ref Vector256 destBase = ref Unsafe.As>(ref dest); + ref Vector256 destRef = ref dest.V01; - for (int i = 0; i < 8; i += 2) - { - Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + for (int i = 0; i < 8; i += 2) + { + Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); - Vector256 row = Avx2.PackSignedSaturate(row0, row1); - row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16(); + Vector256 row = Avx2.PackSignedSaturate(row0, row1); + row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16(); - Unsafe.Add(ref destBase, i / 2) = row; - } + Unsafe.Add(ref destRef, i / 2) = row; } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index da4723e21..75f384848 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder where TPixel : unmanaged, IPixel { // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i]; @@ -197,7 +197,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder where TPixel : unmanaged, IPixel { // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i]; @@ -270,7 +270,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder where TPixel : unmanaged, IPixel { // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; } @@ -321,7 +321,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder where TPixel : unmanaged, IPixel { // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index d9a04befb..7a2b0a78c 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -2,7 +2,6 @@ // Licensed under the Apache License, Version 2.0. #if SUPPORTS_RUNTIME_INTRINSICS -using System; using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; @@ -37,42 +36,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f); #pragma warning restore SA1310, SA1311, IDE1006 - /// - /// Gets reciprocal coefficients for jpeg quantization tables calculation. - /// - /// - /// - /// Current FDCT implementation expects its results to be multiplied by - /// a reciprocal quantization table. Values in this table must be divided - /// by quantization table values scaled with quality settings. - /// - /// - /// These values were calculates with this formula: - /// - /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8; - /// - /// Where: - /// - /// scalefactor[0] = 1 - /// - /// - /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 - /// - /// Values are also scaled by 8 so DCT code won't do unnecessary division. - /// - /// - public static ReadOnlySpan DctReciprocalAdjustmentCoefficients => new float[] - { - 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, - 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, - 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, - 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, - 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, - 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, - 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, - 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, - }; - /// /// Apply floating point FDCT inplace using simd operations. /// @@ -217,141 +180,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components block.V7 = Avx.Subtract(z11, z4); } - /// - /// Performs 8x8 matrix Inverse Discrete Cosine Transform - /// - /// Source - /// Destination - public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) - { - IDCT8x8_Avx(ref s, ref d); - } - else -#endif - { - IDCT8x4_LeftPart(ref s, ref d); - IDCT8x4_RightPart(ref s, ref d); - } - } - - /// - /// Do IDCT internal operations on the left part of the block. Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// Destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1L; - Vector4 my7 = s.V7L; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3L; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5L; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2L; - Vector4 my6 = s.V6L; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0L; - Vector4 my4 = s.V4L; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0L = my0 + mb0; - d.V7L = my0 - mb0; - d.V1L = my1 + mb1; - d.V6L = my1 - mb1; - d.V2L = my2 + mb2; - d.V5L = my2 - mb2; - d.V3L = my3 + mb3; - d.V4L = my3 - mb3; - } - - /// - /// Do IDCT internal operations on the right part of the block. - /// Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// The destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1R; - Vector4 my7 = s.V7R; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3R; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5R; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2R; - Vector4 my6 = s.V6R; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0R; - Vector4 my4 = s.V4R; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0R = my0 + mb0; - d.V7R = my0 - mb0; - d.V1R = my1 + mb1; - d.V6R = my1 - mb1; - d.V2R = my2 + mb2; - d.V5R = my2 - mb2; - d.V3R = my3 + mb3; - d.V4R = my3 - mb3; - } - /// /// Combined operation of and /// using AVX commands. diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 6f68881cd..1c5cfc8d6 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -1,6 +1,8 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System; +using System.Numerics; using System.Runtime.CompilerServices; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics.X86; @@ -42,6 +44,42 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private const float C_0_125 = 0.1250f; #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore + /// + /// Gets reciprocal coefficients for jpeg quantization tables calculation. + /// + /// + /// + /// Current FDCT implementation expects its results to be multiplied by + /// a reciprocal quantization table. To get 8x8 reciprocal block values in this + /// table must be divided by quantization table values scaled with quality settings. + /// + /// + /// These values were calculates with this formula: + /// + /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8; + /// + /// Where: + /// + /// scalefactor[0] = 1 + /// + /// + /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + /// + /// Values are also scaled by 8 so DCT code won't do unnecessary division. + /// + /// + public static ReadOnlySpan DctReciprocalAdjustmentCoefficients => new float[] + { + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, + 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, + 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, + 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, + 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, + }; + /// /// Apply floating point IDCT inplace. /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. @@ -186,5 +224,140 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components dataRef = ref Unsafe.Add(ref dataRef, 1); } } + + /// + /// Performs 8x8 matrix Inverse Discrete Cosine Transform + /// + /// Source + /// Destination + public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported) + { + IDCT8x8_Avx(ref s, ref d); + } + else +#endif + { + IDCT8x4_LeftPart(ref s, ref d); + IDCT8x4_RightPart(ref s, ref d); + } + } + + /// + /// Do IDCT internal operations on the left part of the block. Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// Destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1L; + Vector4 my7 = s.V7L; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3L; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5L; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2L; + Vector4 my6 = s.V6L; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0L; + Vector4 my4 = s.V4L; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0L = my0 + mb0; + d.V7L = my0 - mb0; + d.V1L = my1 + mb1; + d.V6L = my1 - mb1; + d.V2L = my2 + mb2; + d.V5L = my2 - mb2; + d.V3L = my3 + mb3; + d.V4L = my3 - mb3; + } + + /// + /// Do IDCT internal operations on the right part of the block. + /// Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// The destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1R; + Vector4 my7 = s.V7R; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3R; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5R; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2R; + Vector4 my6 = s.V6R; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0R; + Vector4 my4 = s.V4R; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0R = my0 + mb0; + d.V7R = my0 - mb0; + d.V1R = my1 + mb1; + d.V6R = my1 - mb1; + d.V2R = my2 + mb2; + d.V5R = my2 - mb2; + d.V3R = my3 + mb3; + d.V4R = my3 - mb3; + } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index 878a67b50..abe02d040 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -23,82 +23,65 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// private static ReadOnlySpan SseShuffleMasks => new byte[] { - // 0_A + // row0 + // A B C 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _, - // 0_B _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5, - // 0_C _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, - // 1_A + // row1 + // A B C D E _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11, - // 1_B _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _, - // 1_C 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _, - // 1_D _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _, - // 1_E _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, - // 2_B + // row2 + // B C D E F G 8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _, - // 2_C _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, - // 2_D _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _, - // 2_E _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, - // 2_F _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, - // 2_G _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, - // 3_A + // row3 + // A B C D + // D shuffle mask is the for row4 E row shuffle mask _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _, - // 3_B _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _, - // 3_C _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, - // 3_D/4_E 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, - // 4_F + // row4 + // E F G H + // 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _, - // 4_G _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, - // 4_H _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _, - // 5_B + // row5 + // B C D E F G _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _, - // 5_C _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, - // 5_D 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _, - // 5_E _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, - // 5_F _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, - // 5_G _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, - // 6_D + // row6 + // D E F G H _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, - // 6_E _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _, - // 6_F _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13, - // 6_G _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _, - // 6_H 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, - // 7_F + // row7 + // F G H _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _, - // 7_G 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, - // 7_H _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15 }; @@ -177,95 +160,95 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components fixed (byte* maskPtr = SseShuffleMasks) { - Vector128 A = source.V0.AsByte(); - Vector128 B = source.V1.AsByte(); - Vector128 C = source.V2.AsByte(); - Vector128 D = source.V3.AsByte(); - Vector128 E = source.V4.AsByte(); - Vector128 F = source.V5.AsByte(); - Vector128 G = source.V6.AsByte(); - Vector128 H = source.V7.AsByte(); + Vector128 rowA = source.V0.AsByte(); + Vector128 rowB = source.V1.AsByte(); + Vector128 rowC = source.V2.AsByte(); + Vector128 rowD = source.V3.AsByte(); + Vector128 rowE = source.V4.AsByte(); + Vector128 rowF = source.V5.AsByte(); + Vector128 rowG = source.V6.AsByte(); + Vector128 rowH = source.V7.AsByte(); // row0 - Vector128 row0_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16(); - Vector128 row0_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16(); - Vector128 row0 = Sse2.Or(row0_A, row0_B); - Vector128 row0_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16(); - row0 = Sse2.Or(row0, row0_C); + Vector128 row0A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16(); + Vector128 row0B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16(); + Vector128 row0 = Sse2.Or(row0A, row0B); + Vector128 row0C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16(); + row0 = Sse2.Or(row0, row0C); // row1 - Vector128 row1_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16(); - Vector128 row1_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16(); - Vector128 row1 = Sse2.Or(row1_A, row1_B); - Vector128 row1_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1_C); - Vector128 row1_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1_D); - Vector128 row1_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1_E); + Vector128 row1A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16(); + Vector128 row1B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16(); + Vector128 row1 = Sse2.Or(row1A, row1B); + Vector128 row1C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1C); + Vector128 row1D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1D); + Vector128 row1E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1E); // row2 - Vector128 row2_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16(); - Vector128 row2_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16(); - Vector128 row2 = Sse2.Or(row2_B, row2_C); - Vector128 row2_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_D); - Vector128 row2_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_E); - Vector128 row2_F = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_F); - Vector128 row2_G = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_G); + Vector128 row2B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16(); + Vector128 row2C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16(); + Vector128 row2 = Sse2.Or(row2B, row2C); + Vector128 row2D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2D); + Vector128 row2E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2E); + Vector128 row2F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2F); + Vector128 row2G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2G); // row3 - Vector128 A_3 = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16(); - Vector128 B_3 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16(); - Vector128 row3 = Sse2.Or(A_3, B_3); - Vector128 C_3 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16(); - row3 = Sse2.Or(row3, C_3); - Vector128 D3_E4_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16)); - Vector128 D_3 = Ssse3.Shuffle(D, D3_E4_shuffleMask).AsInt16(); - row3 = Sse2.Or(row3, D_3); + Vector128 row3A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16(); + Vector128 row3B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16(); + Vector128 row3 = Sse2.Or(row3A, row3B); + Vector128 row3C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16(); + row3 = Sse2.Or(row3, row3C); + Vector128 row3D_row4E_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16)); + Vector128 row3D = Ssse3.Shuffle(rowD, row3D_row4E_shuffleMask).AsInt16(); + row3 = Sse2.Or(row3, row3D); // row4 - Vector128 E_4 = Ssse3.Shuffle(E, D3_E4_shuffleMask).AsInt16(); - Vector128 F_4 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16(); - Vector128 row4 = Sse2.Or(E_4, F_4); - Vector128 G_4 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16(); - row4 = Sse2.Or(row4, G_4); - Vector128 H_4 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16(); - row4 = Sse2.Or(row4, H_4); + Vector128 row4E = Ssse3.Shuffle(rowE, row3D_row4E_shuffleMask).AsInt16(); + Vector128 row4F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16(); + Vector128 row4 = Sse2.Or(row4E, row4F); + Vector128 row4G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16(); + row4 = Sse2.Or(row4, row4G); + Vector128 row4H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16(); + row4 = Sse2.Or(row4, row4H); // row5 - Vector128 B_5 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16(); - Vector128 C_5 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16(); - Vector128 row5 = Sse2.Or(B_5, C_5); - Vector128 D_5 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16(); - row5 = Sse2.Or(row5, D_5); - Vector128 E_5 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16(); - row5 = Sse2.Or(row5, E_5); - Vector128 F_5 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16(); - row5 = Sse2.Or(row5, F_5); - Vector128 G_5 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16(); - row5 = Sse2.Or(row5, G_5); + Vector128 row5B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16(); + Vector128 row5C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16(); + Vector128 row5 = Sse2.Or(row5B, row5C); + Vector128 row5D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5D); + Vector128 row5E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5E); + Vector128 row5F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5F); + Vector128 row5G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5G); // row6 - Vector128 D_6 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16(); - Vector128 E_6 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16(); - Vector128 row6 = Sse2.Or(D_6, E_6); - Vector128 F_6 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16(); - row6 = Sse2.Or(row6, F_6); - Vector128 G_6 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16(); - row6 = Sse2.Or(row6, G_6); - Vector128 H_6 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16(); - row6 = Sse2.Or(row6, H_6); + Vector128 row6D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16(); + Vector128 row6E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16(); + Vector128 row6 = Sse2.Or(row6D, row6E); + Vector128 row6F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16(); + row6 = Sse2.Or(row6, row6F); + Vector128 row6G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16(); + row6 = Sse2.Or(row6, row6G); + Vector128 row6H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16(); + row6 = Sse2.Or(row6, row6H); // row7 - Vector128 F_7 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16(); - Vector128 G_7 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16(); - Vector128 row7 = Sse2.Or(F_7, G_7); - Vector128 H_7 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16(); - row7 = Sse2.Or(row7, H_7); + Vector128 row7F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16(); + Vector128 row7G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16(); + Vector128 row7 = Sse2.Or(row7F, row7G); + Vector128 row7H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16(); + row7 = Sse2.Or(row7, row7H); dest.V0 = row0; dest.V1 = row1; @@ -292,105 +275,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components fixed (byte* shuffleVectorsPtr = AvxShuffleMasks) { - // 18 loads - // 10 cross-lane shuffles (permutations) - // 14 shuffles - // 10 bitwise or's - // 4 stores - - // A0 A1 A2 A3 A4 A5 A6 A7 | B0 B1 B2 B3 B4 B5 B6 B7 - // C0 C1 C2 C3 C4 C5 C6 C7 | D0 D1 D2 D3 D4 D5 D6 D7 - // E0 E1 E2 E3 E4 E5 E6 E7 | F0 F1 F2 F3 F4 F5 F6 F7 - // G0 G1 G2 G3 G4 G5 G6 G7 | H0 H1 H2 H3 H4 H5 H6 H7 - Vector256 AB = source.V01.AsByte(); - Vector256 CD = source.V23.AsByte(); - Vector256 EF = source.V45.AsByte(); - Vector256 GH = source.V67.AsByte(); - - // row01 - A0 A1 B0 C0 B1 A2 A3 B2 | C1 D0 E0 D1 C2 B3 A4 A5 - Vector256 AB01_EF01_CD23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32(); - - // row01_AB - (A0 A1) (B0 B1) (A2 A3) (B2 B3) | (B2 B3) (A4 A5) (X X) (X X) - Vector256 row01_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); - // row01_AB - (A0 A1) (B0 X) (B1 A2) (A3 B2) | (X X) (X X) (X B3) (A4 A5) + Vector256 rowsAB = source.V01.AsByte(); + Vector256 rowsCD = source.V23.AsByte(); + Vector256 rowsEF = source.V45.AsByte(); + Vector256 rowsGH = source.V67.AsByte(); + + // rows 0 1 + Vector256 rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32(); + Vector256 row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte(); row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte(); - Vector256 CD01_GH23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32(); - - // row01_CD - (C0 C1) (X X) (X X) (X X) | (C0 C1) (D0 D1) (C2 C3) (X X) - Vector256 row01_CD = Avx2.PermuteVar8x32(CD.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte(); - // row01_CD - (X X) (X C0) (X X) (X X) | (C1 D0) (X D1) (C2 X) (X X) + Vector256 rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32(); + Vector256 row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte(); row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte(); - // row01_EF - (E0 E1) (E2 E3) (F0 F1) (X X) | (E0 E1) (X X) (X X) (X X) - Vector256 row0123_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); - // row01_EF - (X X) (X X) (X X) (X X) | (X X) (E0 X) (X X) (X X) + Vector256 row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte(); Vector256 row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte(); Vector256 row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF); - - // row23 - B4 C3 D2 E1 F0 G0 F1 E2 | D3 C4 B5 A6 A7 B6 C5 D4 - - Vector256 AB23_CD45_EF67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32(); - - // row23_AB - (B4 B5) (X X) (X X) (X X) | (B4 B5) (B6 B7) (A6 A7) (X X) - Vector256 row2345_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); - // row23_AB - (B4 X) (X X) (X X) (X X) | (X X) (B5 A6) (A7 B6) (X X) + // rows 2 3 + Vector256 rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32(); + Vector256 row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); Vector256 row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte(); - // row23_CD - (C2 C3) (D2 D3) (X X) (X X) | (D2 D3) (C4 C5) (D4 D5) (X X) - Vector256 row23_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); - // row23_CD - (X C3) (D2 X) (X X) (X X) | (D3 C4) (X X) (X X) (C5 D4) + Vector256 row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte(); row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte(); - // row23_EF - (X X) (X E1) (F0 X) (F1 E2) | (X X) (X X) (X X) (X X) Vector256 row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte(); - // row23_GH - (G0 G1) (G2 G3) (H0 H1) (X X) | (G2 G3) (X X) (X X) (X X) - Vector256 row2345_GH = Avx2.PermuteVar8x32(GH.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte(); - // row23_GH - (X X) (X X) (X G0) (X X) | (X X) (X X) (X X) (X X) + Vector256 row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte(); Vector256 row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte()); Vector256 row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH)); - - // row45 - E3 F2 G1 H0 H1 G2 F3 E4 | D5 C6 B7 C7 D6 E5 F4 G3 - - // row45_AB - (X X) (X X) (X X) (X X) | (X X) (B7 X) (X X) (X X) + // rows 4 5 Vector256 row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte()); - - // row45_CD - (D6 D7) (X X) (X X) (X X) | (C6 C7) (D4 D5) (D6 D7) (X X) - Vector256 row4567_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); - // row45_CD - (X X) (X X) (X X) (X X) | (D5 C6) (X C7) (D6 X) (X X) + Vector256 row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); Vector256 row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte()); - Vector256 EF45_GH67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32(); - - // row45_EF - (E2 E3) (E4 E5) (F2 F3) (X X) | (E4 E5) (F4 F5) (X X) (X X) - Vector256 row45_EF = Avx2.PermuteVar8x32(EF.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte(); - // row45_EF - (E3 F2) (X X) (X X) (F3 E4) | (X X) (X X) (X E5) (F4 X) + Vector256 rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32(); + Vector256 row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte(); row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte()); - // row45_GH - (X X) (G1 H0) (H1 G2) (X X) | (X X) (X X) (X X) (X G3) Vector256 row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte()); Vector256 row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH)); - - // row67 - H2 H3 G4 F5 E6 D7 E7 F6 | G5 H4 H5 G6 F7 G7 H6 H7 - - // row67_CD - (X X) (X X) (X D7) (X X) | (X X) (X X) (X X) (X X) + // rows 6 7 Vector256 row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte()); - // row67_EF - (E6 E7) (F4 F5) (F6 F7) (X X) | (F6 F7) (X X) (X X) (X X) - Vector256 row67_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); - // row67_EF - (X X) (X F5) (E6 X) (E7 F6) | (X X) (X X) (F7 X) (X X) + Vector256 row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte()); - // row67_GH - (G4 G5) (H2 H3) (X X) (X X) | (G4 G5) (G6 G7) (H4 H5) (H6 H7) - Vector256 row67_GH = Avx2.PermuteVar8x32(GH.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte(); - // row67_GH - (H2 H3) (G4 X) (X X) (X X) | (G5 H4) (H5 G6) (X G7) (H6 H7) + Vector256 row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte(); row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte()); Vector256 row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH); diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs index b826193c3..898bbdb45 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs @@ -9,8 +9,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class Block8x8F_Quantize { - private Block8x8F block = default; - private Block8x8F quant = default; + private Block8x8F block = CreateFromScalar(1); + private Block8x8F quant = CreateFromScalar(1); private Block8x8 result = default; [Benchmark] @@ -19,5 +19,32 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant); return this.result[0]; } + + private static Block8x8F CreateFromScalar(float scalar) + { + Block8x8F block = default; + for (int i = 0; i < 64; i++) + { + block[i] = scalar; + } + + return block; + } } } + +/* +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update) +Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 1. No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 2. SSE : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 3. AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + +| Method | Job | Mean | Error | StdDev | Ratio | +|--------- |-----------------|---------:|---------:|---------:|------:| +| Quantize | No HwIntrinsics | 73.34 ns | 1.081 ns | 1.011 ns | 1.00 | +| Quantize | SSE | 24.11 ns | 0.298 ns | 0.279 ns | 0.33 | +| Quantize | AVX | 15.90 ns | 0.074 ns | 0.065 ns | 0.22 | + */ diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs index 47f7d2fbc..28899b51e 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -33,3 +33,17 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations } } } + +/* +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update) +Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + +| Method | Job | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated | +|-------------- |---------------- |----------:|----------:|----------:|------:|------:|------:|------:|----------:| +| TransposeInto | No HwIntrinsics | 19.658 ns | 0.0550 ns | 0.0515 ns | 1.00 | - | - | - | - | +| TransposeInto | AVX | 8.613 ns | 0.0249 ns | 0.0208 ns | 0.44 | - | - | - | - | +*/ diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index 5ceb4c8a0..ffe0f4c02 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -65,17 +65,17 @@ namespace SixLabors.ImageSharp.Benchmarks .WithId("1. No HwIntrinsics").AsBaseline()); #if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) + if (Sse.IsSupported) { this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithId("2. AVX")); + .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) + .WithId("2. SSE")); } - if (Sse.IsSupported) + if (Avx.IsSupported) { this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) - .WithId("3. SSE")); + .WithId("3. AVX")); } #endif }