Browse Source

Tidied up the code, added benchmarks

pull/1761/head
Dmitry Pentin 4 years ago
parent
commit
d21e374e86
  1. 2
      src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
  2. 29
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
  3. 172
      src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
  4. 173
      src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
  5. 290
      src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
  6. 31
      tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs
  7. 14
      tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
  8. 10
      tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs

2
src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs

@ -5,8 +5,10 @@ using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
using System.Text;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components

29
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs

@ -35,33 +35,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
[FieldOffset(224)]
public Vector256<float> V7;
private static ReadOnlySpan<int> DivideIntoInt16_Avx2_ShuffleMask => new int[] {
0, 1, 4, 5, 2, 3, 6, 7
};
private static readonly Vector256<int> MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
fixed (int* maskPtr = DivideIntoInt16_Avx2_ShuffleMask)
{
Vector256<int> crossLaneShuffleMask = Avx.LoadVector256(maskPtr).AsInt32();
ref Vector256<float> aBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref a);
ref Vector256<float> bBase = ref Unsafe.As<Block8x8F, Vector256<float>>(ref b);
ref Vector256<float> aBase = ref a.V0;
ref Vector256<float> bBase = ref b.V0;
ref Vector256<short> destBase = ref Unsafe.As<Block8x8, Vector256<short>>(ref dest);
ref Vector256<short> destRef = ref dest.V01;
for (int i = 0; i < 8; i += 2)
{
Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
for (int i = 0; i < 8; i += 2)
{
Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16();
Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16();
Unsafe.Add(ref destBase, i / 2) = row;
}
Unsafe.Add(ref destRef, i / 2) = row;
}
}

172
src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs

@ -2,7 +2,6 @@
// Licensed under the Apache License, Version 2.0.
#if SUPPORTS_RUNTIME_INTRINSICS
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
@ -37,42 +36,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
private static readonly Vector256<float> mm256_F_0_7653 = Vector256.Create(0.765366865f);
#pragma warning restore SA1310, SA1311, IDE1006
/// <summary>
/// Gets reciprocal coefficients for jpeg quantization tables calculation.
/// </summary>
/// <remarks>
/// <para>
/// Current FDCT implementation expects its results to be multiplied by
/// a reciprocal quantization table. Values in this table must be divided
/// by quantization table values scaled with quality settings.
/// </para>
/// <para>
/// These values were calculates with this formula:
/// <code>
/// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
/// </code>
/// Where:
/// <code>
/// scalefactor[0] = 1
/// </code>
/// <code>
/// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
/// </code>
/// Values are also scaled by 8 so DCT code won't do unnecessary division.
/// </para>
/// </remarks>
public static ReadOnlySpan<float> DctReciprocalAdjustmentCoefficients => new float[]
{
0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
};
/// <summary>
/// Apply floating point FDCT inplace using simd operations.
/// </summary>
@ -217,141 +180,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
block.V7 = Avx.Subtract(z11, z4);
}
/// <summary>
/// Performs 8x8 matrix Inverse Discrete Cosine Transform
/// </summary>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>
public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
IDCT8x8_Avx(ref s, ref d);
}
else
#endif
{
IDCT8x4_LeftPart(ref s, ref d);
IDCT8x4_RightPart(ref s, ref d);
}
}
/// <summary>
/// Do IDCT internal operations on the left part of the block. Original src:
/// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
/// </summary>
/// <param name="s">The source block</param>
/// <param name="d">Destination block</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
{
Vector4 my1 = s.V1L;
Vector4 my7 = s.V7L;
Vector4 mz0 = my1 + my7;
Vector4 my3 = s.V3L;
Vector4 mz2 = my3 + my7;
Vector4 my5 = s.V5L;
Vector4 mz1 = my3 + my5;
Vector4 mz3 = my1 + my5;
Vector4 mz4 = (mz0 + mz1) * C_1_175876;
mz2 = (mz2 * C_1_961571) + mz4;
mz3 = (mz3 * C_0_390181) + mz4;
mz0 = mz0 * C_0_899976;
mz1 = mz1 * C_2_562915;
Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
Vector4 my2 = s.V2L;
Vector4 my6 = s.V6L;
mz4 = (my2 + my6) * C_0_541196;
Vector4 my0 = s.V0L;
Vector4 my4 = s.V4L;
mz0 = my0 + my4;
mz1 = my0 - my4;
mz2 = mz4 + (my6 * C_1_847759);
mz3 = mz4 + (my2 * C_0_765367);
my0 = mz0 + mz3;
my3 = mz0 - mz3;
my1 = mz1 + mz2;
my2 = mz1 - mz2;
d.V0L = my0 + mb0;
d.V7L = my0 - mb0;
d.V1L = my1 + mb1;
d.V6L = my1 - mb1;
d.V2L = my2 + mb2;
d.V5L = my2 - mb2;
d.V3L = my3 + mb3;
d.V4L = my3 - mb3;
}
/// <summary>
/// Do IDCT internal operations on the right part of the block.
/// Original src:
/// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
/// </summary>
/// <param name="s">The source block</param>
/// <param name="d">The destination block</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
{
Vector4 my1 = s.V1R;
Vector4 my7 = s.V7R;
Vector4 mz0 = my1 + my7;
Vector4 my3 = s.V3R;
Vector4 mz2 = my3 + my7;
Vector4 my5 = s.V5R;
Vector4 mz1 = my3 + my5;
Vector4 mz3 = my1 + my5;
Vector4 mz4 = (mz0 + mz1) * C_1_175876;
mz2 = (mz2 * C_1_961571) + mz4;
mz3 = (mz3 * C_0_390181) + mz4;
mz0 = mz0 * C_0_899976;
mz1 = mz1 * C_2_562915;
Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
Vector4 my2 = s.V2R;
Vector4 my6 = s.V6R;
mz4 = (my2 + my6) * C_0_541196;
Vector4 my0 = s.V0R;
Vector4 my4 = s.V4R;
mz0 = my0 + my4;
mz1 = my0 - my4;
mz2 = mz4 + (my6 * C_1_847759);
mz3 = mz4 + (my2 * C_0_765367);
my0 = mz0 + mz3;
my3 = mz0 - mz3;
my1 = mz1 + mz2;
my2 = mz1 - mz2;
d.V0R = my0 + mb0;
d.V7R = my0 - mb0;
d.V1R = my1 + mb1;
d.V6R = my1 - mb1;
d.V2R = my2 + mb2;
d.V5R = my2 - mb2;
d.V3R = my3 + mb3;
d.V4R = my3 - mb3;
}
/// <summary>
/// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
/// using AVX commands.

173
src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs

@ -1,6 +1,8 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics.X86;
@ -42,6 +44,42 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
private const float C_0_125 = 0.1250f;
#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
/// <summary>
/// Gets reciprocal coefficients for jpeg quantization tables calculation.
/// </summary>
/// <remarks>
/// <para>
/// Current FDCT implementation expects its results to be multiplied by
/// a reciprocal quantization table. Values in this table must be divided
/// by quantization table values scaled with quality settings.
/// </para>
/// <para>
/// These values were calculates with this formula:
/// <code>
/// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
/// </code>
/// Where:
/// <code>
/// scalefactor[0] = 1
/// </code>
/// <code>
/// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
/// </code>
/// Values are also scaled by 8 so DCT code won't do unnecessary division.
/// </para>
/// </remarks>
public static ReadOnlySpan<float> DctReciprocalAdjustmentCoefficients => new float[]
{
0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
};
/// <summary>
/// Apply floating point IDCT inplace.
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
@ -186,5 +224,140 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
dataRef = ref Unsafe.Add(ref dataRef, 1);
}
}
/// <summary>
/// Performs 8x8 matrix Inverse Discrete Cosine Transform
/// </summary>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>
public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
IDCT8x8_Avx(ref s, ref d);
}
else
#endif
{
IDCT8x4_LeftPart(ref s, ref d);
IDCT8x4_RightPart(ref s, ref d);
}
}
/// <summary>
/// Do IDCT internal operations on the left part of the block. Original src:
/// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
/// </summary>
/// <param name="s">The source block</param>
/// <param name="d">Destination block</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
{
Vector4 my1 = s.V1L;
Vector4 my7 = s.V7L;
Vector4 mz0 = my1 + my7;
Vector4 my3 = s.V3L;
Vector4 mz2 = my3 + my7;
Vector4 my5 = s.V5L;
Vector4 mz1 = my3 + my5;
Vector4 mz3 = my1 + my5;
Vector4 mz4 = (mz0 + mz1) * C_1_175876;
mz2 = (mz2 * C_1_961571) + mz4;
mz3 = (mz3 * C_0_390181) + mz4;
mz0 = mz0 * C_0_899976;
mz1 = mz1 * C_2_562915;
Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
Vector4 my2 = s.V2L;
Vector4 my6 = s.V6L;
mz4 = (my2 + my6) * C_0_541196;
Vector4 my0 = s.V0L;
Vector4 my4 = s.V4L;
mz0 = my0 + my4;
mz1 = my0 - my4;
mz2 = mz4 + (my6 * C_1_847759);
mz3 = mz4 + (my2 * C_0_765367);
my0 = mz0 + mz3;
my3 = mz0 - mz3;
my1 = mz1 + mz2;
my2 = mz1 - mz2;
d.V0L = my0 + mb0;
d.V7L = my0 - mb0;
d.V1L = my1 + mb1;
d.V6L = my1 - mb1;
d.V2L = my2 + mb2;
d.V5L = my2 - mb2;
d.V3L = my3 + mb3;
d.V4L = my3 - mb3;
}
/// <summary>
/// Do IDCT internal operations on the right part of the block.
/// Original src:
/// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
/// </summary>
/// <param name="s">The source block</param>
/// <param name="d">The destination block</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
{
Vector4 my1 = s.V1R;
Vector4 my7 = s.V7R;
Vector4 mz0 = my1 + my7;
Vector4 my3 = s.V3R;
Vector4 mz2 = my3 + my7;
Vector4 my5 = s.V5R;
Vector4 mz1 = my3 + my5;
Vector4 mz3 = my1 + my5;
Vector4 mz4 = (mz0 + mz1) * C_1_175876;
mz2 = (mz2 * C_1_961571) + mz4;
mz3 = (mz3 * C_0_390181) + mz4;
mz0 = mz0 * C_0_899976;
mz1 = mz1 * C_2_562915;
Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
Vector4 my2 = s.V2R;
Vector4 my6 = s.V6R;
mz4 = (my2 + my6) * C_0_541196;
Vector4 my0 = s.V0R;
Vector4 my4 = s.V4R;
mz0 = my0 + my4;
mz1 = my0 - my4;
mz2 = mz4 + (my6 * C_1_847759);
mz3 = mz4 + (my2 * C_0_765367);
my0 = mz0 + mz3;
my3 = mz0 - mz3;
my1 = mz1 + mz2;
my2 = mz1 - mz2;
d.V0R = my0 + mb0;
d.V7R = my0 - mb0;
d.V1R = my1 + mb1;
d.V6R = my1 - mb1;
d.V2R = my2 + mb2;
d.V5R = my2 - mb2;
d.V3R = my3 + mb3;
d.V4R = my3 - mb3;
}
}
}

290
src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs

@ -23,82 +23,65 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
/// </summary>
private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
{
// 0_A
// row0
// A B C
0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
// 0_B
_, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
// 0_C
_, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
// 1_A
// row1
// A B C D E
_, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
// 1_B
_, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _,
// 1_C
2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
// 1_D
_, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
// 1_E
_, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
// 2_B
// row2
// B C D E F G
8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 2_C
_, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
// 2_D
_, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _,
// 2_E
_, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
// 2_F
_, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
// 2_G
_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _,
// 3_A
// row3
// A B C D
// D shuffle mask is the for row4 E row shuffle mask
_, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
// 3_B
_, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
// 3_C
_, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
// 3_D/4_E
6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
// 4_F
// row4
// E F G H
// 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
_, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
// 4_G
_, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
// 4_H
_, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
// 5_B
// row5
// B C D E F G
_, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
// 5_C
_, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
// 5_D
10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
// 5_E
_, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _,
// 5_F
_, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _,
// 5_G
_, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
// 6_D
// row6
// D E F G H
_, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _,
// 6_E
_, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
// 6_F
_, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
// 6_G
_, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _,
// 6_H
4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
// 7_F
// row7
// F G H
_, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _,
// 7_G
10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
// 7_H
_, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
};
@ -177,95 +160,95 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
fixed (byte* maskPtr = SseShuffleMasks)
{
Vector128<byte> A = source.V0.AsByte();
Vector128<byte> B = source.V1.AsByte();
Vector128<byte> C = source.V2.AsByte();
Vector128<byte> D = source.V3.AsByte();
Vector128<byte> E = source.V4.AsByte();
Vector128<byte> F = source.V5.AsByte();
Vector128<byte> G = source.V6.AsByte();
Vector128<byte> H = source.V7.AsByte();
Vector128<byte> rowA = source.V0.AsByte();
Vector128<byte> rowB = source.V1.AsByte();
Vector128<byte> rowC = source.V2.AsByte();
Vector128<byte> rowD = source.V3.AsByte();
Vector128<byte> rowE = source.V4.AsByte();
Vector128<byte> rowF = source.V5.AsByte();
Vector128<byte> rowG = source.V6.AsByte();
Vector128<byte> rowH = source.V7.AsByte();
// row0
Vector128<short> row0_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16();
Vector128<short> row0_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16();
Vector128<short> row0 = Sse2.Or(row0_A, row0_B);
Vector128<short> row0_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16();
row0 = Sse2.Or(row0, row0_C);
Vector128<short> row0A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16();
Vector128<short> row0B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16();
Vector128<short> row0 = Sse2.Or(row0A, row0B);
Vector128<short> row0C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16();
row0 = Sse2.Or(row0, row0C);
// row1
Vector128<short> row1_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16();
Vector128<short> row1_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16();
Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
Vector128<short> row1_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16();
row1 = Sse2.Or(row1, row1_C);
Vector128<short> row1_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16();
row1 = Sse2.Or(row1, row1_D);
Vector128<short> row1_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16();
row1 = Sse2.Or(row1, row1_E);
Vector128<short> row1A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16();
Vector128<short> row1B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16();
Vector128<short> row1 = Sse2.Or(row1A, row1B);
Vector128<short> row1C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16();
row1 = Sse2.Or(row1, row1C);
Vector128<short> row1D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16();
row1 = Sse2.Or(row1, row1D);
Vector128<short> row1E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16();
row1 = Sse2.Or(row1, row1E);
// row2
Vector128<short> row2_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16();
Vector128<short> row2_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16();
Vector128<short> row2 = Sse2.Or(row2_B, row2_C);
Vector128<short> row2_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16();
row2 = Sse2.Or(row2, row2_D);
Vector128<short> row2_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16();
row2 = Sse2.Or(row2, row2_E);
Vector128<short> row2_F = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16();
row2 = Sse2.Or(row2, row2_F);
Vector128<short> row2_G = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16();
row2 = Sse2.Or(row2, row2_G);
Vector128<short> row2B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16();
Vector128<short> row2C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16();
Vector128<short> row2 = Sse2.Or(row2B, row2C);
Vector128<short> row2D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16();
row2 = Sse2.Or(row2, row2D);
Vector128<short> row2E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16();
row2 = Sse2.Or(row2, row2E);
Vector128<short> row2F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16();
row2 = Sse2.Or(row2, row2F);
Vector128<short> row2G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16();
row2 = Sse2.Or(row2, row2G);
// row3
Vector128<short> A_3 = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16();
Vector128<short> B_3 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16();
Vector128<short> row3 = Sse2.Or(A_3, B_3);
Vector128<short> C_3 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
row3 = Sse2.Or(row3, C_3);
Vector128<byte> D3_E4_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16));
Vector128<short> D_3 = Ssse3.Shuffle(D, D3_E4_shuffleMask).AsInt16();
row3 = Sse2.Or(row3, D_3);
Vector128<short> row3A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16();
Vector128<short> row3B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16();
Vector128<short> row3 = Sse2.Or(row3A, row3B);
Vector128<short> row3C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
row3 = Sse2.Or(row3, row3C);
Vector128<byte> row3D_row4E_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16));
Vector128<short> row3D = Ssse3.Shuffle(rowD, row3D_row4E_shuffleMask).AsInt16();
row3 = Sse2.Or(row3, row3D);
// row4
Vector128<short> E_4 = Ssse3.Shuffle(E, D3_E4_shuffleMask).AsInt16();
Vector128<short> F_4 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16();
Vector128<short> row4 = Sse2.Or(E_4, F_4);
Vector128<short> G_4 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16();
row4 = Sse2.Or(row4, G_4);
Vector128<short> H_4 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16();
row4 = Sse2.Or(row4, H_4);
Vector128<short> row4E = Ssse3.Shuffle(rowE, row3D_row4E_shuffleMask).AsInt16();
Vector128<short> row4F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16();
Vector128<short> row4 = Sse2.Or(row4E, row4F);
Vector128<short> row4G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16();
row4 = Sse2.Or(row4, row4G);
Vector128<short> row4H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16();
row4 = Sse2.Or(row4, row4H);
// row5
Vector128<short> B_5 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16();
Vector128<short> C_5 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16();
Vector128<short> row5 = Sse2.Or(B_5, C_5);
Vector128<short> D_5 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16();
row5 = Sse2.Or(row5, D_5);
Vector128<short> E_5 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16();
row5 = Sse2.Or(row5, E_5);
Vector128<short> F_5 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16();
row5 = Sse2.Or(row5, F_5);
Vector128<short> G_5 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16();
row5 = Sse2.Or(row5, G_5);
Vector128<short> row5B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16();
Vector128<short> row5C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16();
Vector128<short> row5 = Sse2.Or(row5B, row5C);
Vector128<short> row5D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16();
row5 = Sse2.Or(row5, row5D);
Vector128<short> row5E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16();
row5 = Sse2.Or(row5, row5E);
Vector128<short> row5F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16();
row5 = Sse2.Or(row5, row5F);
Vector128<short> row5G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16();
row5 = Sse2.Or(row5, row5G);
// row6
Vector128<short> D_6 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16();
Vector128<short> E_6 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16();
Vector128<short> row6 = Sse2.Or(D_6, E_6);
Vector128<short> F_6 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16();
row6 = Sse2.Or(row6, F_6);
Vector128<short> G_6 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16();
row6 = Sse2.Or(row6, G_6);
Vector128<short> H_6 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16();
row6 = Sse2.Or(row6, H_6);
Vector128<short> row6D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16();
Vector128<short> row6E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16();
Vector128<short> row6 = Sse2.Or(row6D, row6E);
Vector128<short> row6F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16();
row6 = Sse2.Or(row6, row6F);
Vector128<short> row6G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16();
row6 = Sse2.Or(row6, row6G);
Vector128<short> row6H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16();
row6 = Sse2.Or(row6, row6H);
// row7
Vector128<short> F_7 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16();
Vector128<short> G_7 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16();
Vector128<short> row7 = Sse2.Or(F_7, G_7);
Vector128<short> H_7 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16();
row7 = Sse2.Or(row7, H_7);
Vector128<short> row7F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16();
Vector128<short> row7G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16();
Vector128<short> row7 = Sse2.Or(row7F, row7G);
Vector128<short> row7H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16();
row7 = Sse2.Or(row7, row7H);
dest.V0 = row0;
dest.V1 = row1;
@ -292,105 +275,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
{
// 18 loads
// 10 cross-lane shuffles (permutations)
// 14 shuffles
// 10 bitwise or's
// 4 stores
// A0 A1 A2 A3 A4 A5 A6 A7 | B0 B1 B2 B3 B4 B5 B6 B7
// C0 C1 C2 C3 C4 C5 C6 C7 | D0 D1 D2 D3 D4 D5 D6 D7
// E0 E1 E2 E3 E4 E5 E6 E7 | F0 F1 F2 F3 F4 F5 F6 F7
// G0 G1 G2 G3 G4 G5 G6 G7 | H0 H1 H2 H3 H4 H5 H6 H7
Vector256<byte> AB = source.V01.AsByte();
Vector256<byte> CD = source.V23.AsByte();
Vector256<byte> EF = source.V45.AsByte();
Vector256<byte> GH = source.V67.AsByte();
// row01 - A0 A1 B0 C0 B1 A2 A3 B2 | C1 D0 E0 D1 C2 B3 A4 A5
Vector256<int> AB01_EF01_CD23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
// row01_AB - (A0 A1) (B0 B1) (A2 A3) (B2 B3) | (B2 B3) (A4 A5) (X X) (X X)
Vector256<byte> row01_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
// row01_AB - (A0 A1) (B0 X) (B1 A2) (A3 B2) | (X X) (X X) (X B3) (A4 A5)
Vector256<byte> rowsAB = source.V01.AsByte();
Vector256<byte> rowsCD = source.V23.AsByte();
Vector256<byte> rowsEF = source.V45.AsByte();
Vector256<byte> rowsGH = source.V67.AsByte();
// rows 0 1
Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
Vector256<int> CD01_GH23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
// row01_CD - (C0 C1) (X X) (X X) (X X) | (C0 C1) (D0 D1) (C2 C3) (X X)
Vector256<byte> row01_CD = Avx2.PermuteVar8x32(CD.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte();
// row01_CD - (X X) (X C0) (X X) (X X) | (C1 D0) (X D1) (C2 X) (X X)
Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
// row01_EF - (E0 E1) (E2 E3) (F0 F1) (X X) | (E0 E1) (X X) (X X) (X X)
Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
// row01_EF - (X X) (X X) (X X) (X X) | (X X) (E0 X) (X X) (X X)
Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
// row23 - B4 C3 D2 E1 F0 G0 F1 E2 | D3 C4 B5 A6 A7 B6 C5 D4
Vector256<int> AB23_CD45_EF67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
// row23_AB - (B4 B5) (X X) (X X) (X X) | (B4 B5) (B6 B7) (A6 A7) (X X)
Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
// row23_AB - (B4 X) (X X) (X X) (X X) | (X X) (B5 A6) (A7 B6) (X X)
// rows 2 3
Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
// row23_CD - (C2 C3) (D2 D3) (X X) (X X) | (D2 D3) (C4 C5) (D4 D5) (X X)
Vector256<byte> row23_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte();
// row23_CD - (X C3) (D2 X) (X X) (X X) | (D3 C4) (X X) (X X) (C5 D4)
Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
// row23_EF - (X X) (X E1) (F0 X) (F1 E2) | (X X) (X X) (X X) (X X)
Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
// row23_GH - (G0 G1) (G2 G3) (H0 H1) (X X) | (G2 G3) (X X) (X X) (X X)
Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(GH.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte();
// row23_GH - (X X) (X X) (X G0) (X X) | (X X) (X X) (X X) (X X)
Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
// row45 - E3 F2 G1 H0 H1 G2 F3 E4 | D5 C6 B7 C7 D6 E5 F4 G3
// row45_AB - (X X) (X X) (X X) (X X) | (X X) (B7 X) (X X) (X X)
// rows 4 5
Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
// row45_CD - (D6 D7) (X X) (X X) (X X) | (C6 C7) (D4 D5) (D6 D7) (X X)
Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
// row45_CD - (X X) (X X) (X X) (X X) | (D5 C6) (X C7) (D6 X) (X X)
Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
Vector256<int> EF45_GH67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
// row45_EF - (E2 E3) (E4 E5) (F2 F3) (X X) | (E4 E5) (F4 F5) (X X) (X X)
Vector256<byte> row45_EF = Avx2.PermuteVar8x32(EF.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte();
// row45_EF - (E3 F2) (X X) (X X) (F3 E4) | (X X) (X X) (X E5) (F4 X)
Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
// row45_GH - (X X) (G1 H0) (H1 G2) (X X) | (X X) (X X) (X X) (X G3)
Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
// row67 - H2 H3 G4 F5 E6 D7 E7 F6 | G5 H4 H5 G6 F7 G7 H6 H7
// row67_CD - (X X) (X X) (X D7) (X X) | (X X) (X X) (X X) (X X)
// rows 6 7
Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
// row67_EF - (E6 E7) (F4 F5) (F6 F7) (X X) | (F6 F7) (X X) (X X) (X X)
Vector256<byte> row67_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte();
// row67_EF - (X X) (X F5) (E6 X) (E7 F6) | (X X) (X X) (F7 X) (X X)
Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
// row67_GH - (G4 G5) (H2 H3) (X X) (X X) | (G4 G5) (G6 G7) (H4 H5) (H6 H7)
Vector256<byte> row67_GH = Avx2.PermuteVar8x32(GH.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte();
// row67_GH - (H2 H3) (G4 X) (X X) (X X) | (G5 H4) (H5 G6) (X G7) (H6 H7)
Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);

31
tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs

@ -9,8 +9,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class Block8x8F_Quantize
{
private Block8x8F block = default;
private Block8x8F quant = default;
private Block8x8F block = CreateFromScalar(1);
private Block8x8F quant = CreateFromScalar(1);
private Block8x8 result = default;
[Benchmark]
@ -19,5 +19,32 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant);
return this.result[0];
}
private static Block8x8F CreateFromScalar(float scalar)
{
Block8x8F block = default;
for (int i = 0; i < 64; i++)
{
block[i] = scalar;
}
return block;
}
}
}
/*
BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.100-preview.3.21202.5
[Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
1. No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
2. SSE : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
3. AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
| Method | Job | Mean | Error | StdDev | Ratio |
|--------- |-----------------|---------:|---------:|---------:|------:|
| Quantize | No HwIntrinsics | 73.34 ns | 1.081 ns | 1.011 ns | 1.00 |
| Quantize | SSE | 24.11 ns | 0.298 ns | 0.279 ns | 0.33 |
| Quantize | AVX | 15.90 ns | 0.074 ns | 0.065 ns | 0.22 |
*/

14
tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs

@ -33,3 +33,17 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
}
}
}
/*
BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.100-preview.3.21202.5
[Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
| Method | Job | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|-------------- |---------------- |----------:|----------:|----------:|------:|------:|------:|------:|----------:|
| TransposeInto | No HwIntrinsics | 19.658 ns | 0.0550 ns | 0.0515 ns | 1.00 | - | - | - | - |
| TransposeInto | AVX | 8.613 ns | 0.0249 ns | 0.0208 ns | 0.44 | - | - | - | - |
*/

10
tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs

@ -65,17 +65,17 @@ namespace SixLabors.ImageSharp.Benchmarks
.WithId("1. No HwIntrinsics").AsBaseline());
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
if (Sse.IsSupported)
{
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
.WithId("2. AVX"));
.WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off))
.WithId("2. SSE"));
}
if (Sse.IsSupported)
if (Avx.IsSupported)
{
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
.WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off))
.WithId("3. SSE"));
.WithId("3. AVX"));
}
#endif
}

Loading…
Cancel
Save