@ -3,6 +3,7 @@
using System.Numerics ;
using System.Runtime.CompilerServices ;
using System.Runtime.InteropServices ;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics.X86 ;
#endif
@ -15,102 +16,202 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
/// </summary>
internal static partial class FastFloatingPointDCT
{
#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
private const float C_1_175876 = 1.175875602f ;
private const float C_1_961571 = - 1.961570560f ;
private const float C_0_390181 = - 0.390180644f ;
private const float C_0_899976 = - 0.899976223f ;
private const float C_2_562915 = - 2.562915447f ;
private const float C_0_298631 = 0.298631336f ;
private const float C_2_053120 = 2.053119869f ;
private const float C_3_072711 = 3.072711026f ;
private const float C_1_501321 = 1.501321110f ;
private const float C_0_541196 = 0.541196100f ;
private const float C_1_847759 = - 1.847759065f ;
private const float C_0_765367 = 0.765366865f ;
private const float C_0_125 = 0.1250f ;
#pragma warning disable SA1311, IDE1006 // naming rules violation warnings
private static readonly Vector4 mm128_F_0_7071 = new Vector4 ( 0.707106781f ) ;
private static readonly Vector4 mm128_F_0_3826 = new Vector4 ( 0.382683433f ) ;
private static readonly Vector4 mm128_F_0_5411 = new Vector4 ( 0.541196100f ) ;
private static readonly Vector4 mm128_F_1_3065 = new Vector4 ( 1.306562965f ) ;
#pragma warning restore SA1311, IDE1006
#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
private static readonly Vector4 mm128_F_0_7071 = new ( 0.707106781f ) ;
private static readonly Vector4 mm128_F_0_3826 = new ( 0.382683433f ) ;
private static readonly Vector4 mm128_F_0_5411 = new ( 0.541196100f ) ;
private static readonly Vector4 mm128_F_1_3065 = new ( 1.306562965f ) ;
private static readonly Vector4 mm128_F_1_4142 = new ( 1.414213562f ) ;
private static readonly Vector4 mm128_F_1_8477 = new ( 1.847759065f ) ;
private static readonly Vector4 mm128_F_n1_0823 = new ( - 1.082392200f ) ;
private static readonly Vector4 mm128_F_n2_6131 = new ( - 2.613125930f ) ;
#pragma warning restore SA1310, SA1311, IDE1006
/// <summary>
/// Gets reciprocal coefficients for jpeg quantization tables calculation .
/// Gets adjustment table for quantization tables.
/// </summary>
/// <remarks>
/// <para>
/// Current FDCT implementation expects its results to be multiplied by
/// a reciprocal quantization table. To get 8x8 reciprocal block values in this
/// table must be divided by quantization table values scaled with quality settings.
/// Current IDCT and FDCT implementations are based on Arai, Agui,
/// and Nakajima's algorithm. Neither DCT method produces
/// finished DCT output; the final step is fused into the
/// quantization step. Quantization and de-quantization coefficients
/// must be multiplied by these values.
/// </para>
/// <para>
/// These values were calculates with this formula:
/// <code>
/// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
/// </code>
/// Where:
/// Given values were generated by formula:
/// <code>
/// scalefactor[row] * scalefactor[col], where
/// scalefactor[0] = 1
/// </code>
/// <code>
/// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
/// </code>
/// Values are also scaled by 8 so DCT code won't do extra division/multiplication.
/// </para>
/// </remarks>
internal static readonly float [ ] DctReciprocal AdjustmentCoefficients = new float [ ]
private static readonly float [ ] AdjustmentCoefficients = new float [ ]
{
0.125f , 0.09011998f , 0.09567086f , 0.10630376f , 0.125f , 0.15909483f , 0.23096988f , 0.45306373 f,
0.09011998f , 0.064972885f , 0.068974845f , 0.07664074f , 0.09011998f , 0.11470097f , 0.16652f , 0.32664075 f,
0.09567086f , 0.068974845f , 0.07322331f , 0.081361376f , 0.09567086f , 0.121765904f , 0.17677669f , 0.34675997 f,
0.10630376f , 0.07664074f , 0.081361376f , 0.09040392f , 0.10630376f , 0.13529903f , 0.19642374f , 0.38529903 f,
0.125f , 0.09011998f , 0.09567086f , 0.10630376f , 0.125f , 0.15909483f , 0.23096988f , 0.45306373 f,
0.15909483f , 0.11470097f , 0.121765904f , 0.13529903f , 0.15909483f , 0.2024893f , 0.2939689f , 0.5766407 f ,
0.23096988f , 0.16652f , 0.17677669f , 0.19642374f , 0.23096988f , 0.2939689f , 0.4267767f , 0.8371526 f ,
0.45306373f , 0.32664075f , 0.34675997f , 0.38529903f , 0.45306373f , 0.5766407f , 0.8371526f , 1.642134 f ,
1f , 1.3870399f , 1.306563f , 1.1758755f , 1f , 0.78569496f , 0.5411961f , 0.27589938 f,
1.3870399f , 1.9238797f , 1.812255f , 1.6309863f , 1.3870399f , 1.0897902f , 0.7506606f , 0.38268346 f,
1.306563f , 1.812255f , 1.707107f , 1.5363555f , 1.306563f , 1.02656f , 0.7071068f , 0.36047992 f,
1.1758755f , 1.6309863f , 1.5363555f , 1.3826833f , 1.1758755f , 0.9238795f , 0.63637924f , 0.32442334 f,
1f , 1.3870399f , 1.306563f , 1.1758755f , 1f , 0.78569496f , 0.5411961f , 0.27589938 f,
0.78569496f , 1.0897902f , 1.02656f , 0.9238795f , 0.78569496f , 0.61731654f , 0.42521507f , 0.21677275 f ,
0.5411961f , 0.7506606f , 0.7071068f , 0.63637924f , 0.5411961f , 0.42521507f , 0.29289323f , 0.14931567 f ,
0.27589938f , 0.38268346f , 0.36047992f , 0.32442334f , 0.27589938f , 0.21677275f , 0.14931567f , 0.076120466 f ,
} ;
/// <summary>
/// Prepares a quantization table for use with <see cref="TransformIDCT"/>.
/// </summary>
/// <remarks>
/// Every entry is scaled by 0.125 and by the matching value of
/// <see cref="AdjustmentCoefficients"/>; afterwards the table is transposed in place.
/// </remarks>
/// <param name="quantTable">Quantization table to adjust.</param>
public static void AdjustToIDCT(ref Block8x8F quantTable)
{
    ref float firstEntry = ref Unsafe.As<Block8x8F, float>(ref quantTable);
    ref float firstCoefficient = ref MemoryMarshal.GetReference<float>(AdjustmentCoefficients);

    for (nint index = 0; index < Block8x8F.Size; index++)
    {
        // Same evaluation order as before: (0.125 * entry) * coefficient.
        ref float entry = ref Unsafe.Add(ref firstEntry, index);
        entry = 0.125f * entry * Unsafe.Add(ref firstCoefficient, index);
    }

    // Spectral macroblocks are transposed before quantization
    // so we must transpose quantization table
    quantTable.TransposeInplace();
}
/// <summary>
/// Prepares a quantization table for use with <see cref="TransformFDCT"/>.
/// </summary>
/// <remarks>
/// Every entry is replaced by 0.125 divided by the product of the entry and the
/// matching value of <see cref="AdjustmentCoefficients"/>, i.e. the table becomes
/// a table of reciprocals.
/// </remarks>
/// <param name="quantTable">Quantization table to adjust.</param>
public static void AdjustToFDCT(ref Block8x8F quantTable)
{
    ref float firstEntry = ref Unsafe.As<Block8x8F, float>(ref quantTable);
    ref float firstCoefficient = ref MemoryMarshal.GetReference<float>(AdjustmentCoefficients);

    for (nint index = 0; index < Block8x8F.Size; index++)
    {
        ref float entry = ref Unsafe.Add(ref firstEntry, index);
        float scaled = entry * Unsafe.Add(ref firstCoefficient, index);
        entry = 0.125f / scaled;
    }
}
/// <summary>
/// Apply 2D floating point IDCT inplace.
/// </summary>
/// <remarks>
/// Input block must be dequantized before this method with table
/// adjusted by <see cref="AdjustToIDCT"/>.
/// </remarks>
/// <param name="block">Input block.</param>
public static void TransformIDCT(ref Block8x8F block)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx.IsSupported)
    {
        IDCT8x8_Avx(ref block);
    }
    else
#endif
    {
        // NOTE(review): IDCT_Vector4 guards on Vector.IsHardwareAccelerated, but this
        // dispatch has no scalar fallback to route to when that is false - confirm
        // whether a scalar IDCT is expected here.
        IDCT_Vector4(ref block);
    }
}
/// <summary>
/// Apply 2D floating point FDCT inplace.
/// </summary>
/// <remarks>
/// Input block must be quantized after this method with table adjusted
/// by <see cref="AdjustToFDCT"/>.
/// </remarks>
/// <param name="block">Input block.</param>
public static void TransformFDCT(ref Block8x8F block)
{
    // Exactly one implementation runs per call: AVX when available, otherwise
    // the Vector4 path on accelerated hardware, otherwise the scalar path.
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx.IsSupported)
    {
        FDCT8x8_Avx(ref block);
    }
    else
#endif
    if (Vector.IsHardwareAccelerated)
    {
        FDCT_Vector4(ref block);
    }
    else
    {
        FDCT_Scalar(ref block);
    }
}
/// <summary>
/// Apply floating point IDCT inplace using <see cref="Vector4"/> API.
/// </summary>
/// <remarks>
/// The same 1D pass is run over both 8x4 halves of the block, the block is
/// transposed in place, and the pass is run over both halves again.
/// NOTE(review): the parameter name suggests the caller hands the block over in
/// transposed layout - confirm against the call site.
/// </remarks>
/// <param name="transposedBlock">Input block.</param>
private static void IDCT_Vector4 ( ref Block8x8F transposedBlock )
{
// Only reports misuse; nothing below switches to a scalar path.
DebugGuard . IsTrue ( Vector . IsHardwareAccelerated , "Scalar implementation should be called for non-accelerated hardware." ) ;
// First pass - process columns
IDCT8x4_Vector4 ( ref transposedBlock . V0L ) ;
IDCT8x4_Vector4 ( ref transposedBlock . V0R ) ;
// Second pass - process rows
transposedBlock . TransposeInplace ( ) ;
IDCT8x4_Vector4 ( ref transposedBlock . V0L ) ;
IDCT8x4_Vector4 ( ref transposedBlock . V0R ) ;
// Applies 1D floating point IDCT inplace on 8x4 part of 8x8 block
// Offsets are written as row * 2: presumably every block row is stored as two
// consecutive Vector4 values (left, right), so a stride of 2 stays inside one
// half - TODO confirm against the Block8x8F layout.
static void IDCT8x4_Vector4 ( ref Vector4 vecRef )
{
// Even part
// Loads rows 0, 2, 4 and 6.
Vector4 tmp0 = Unsafe . Add ( ref vecRef , 0 * 2 ) ;
Vector4 tmp1 = Unsafe . Add ( ref vecRef , 2 * 2 ) ;
Vector4 tmp2 = Unsafe . Add ( ref vecRef , 4 * 2 ) ;
Vector4 tmp3 = Unsafe . Add ( ref vecRef , 6 * 2 ) ;
// z5 starts as a copy of row 0 and is reused as scratch in the odd part.
Vector4 z5 = tmp0 ;
Vector4 tmp10 = z5 + tmp2 ;
Vector4 tmp11 = z5 - tmp2 ;
Vector4 tmp13 = tmp1 + tmp3 ;
Vector4 tmp12 = ( ( tmp1 - tmp3 ) * mm128_F_1_4142 ) - tmp13 ;
// tmp0..tmp3 now hold the even-part results used by the final stage.
tmp0 = tmp10 + tmp13 ;
tmp3 = tmp10 - tmp13 ;
tmp1 = tmp11 + tmp12 ;
tmp2 = tmp11 - tmp12 ;
// Odd part
// Loads rows 1, 3, 5 and 7.
Vector4 tmp4 = Unsafe . Add ( ref vecRef , 1 * 2 ) ;
Vector4 tmp5 = Unsafe . Add ( ref vecRef , 3 * 2 ) ;
Vector4 tmp6 = Unsafe . Add ( ref vecRef , 5 * 2 ) ;
Vector4 tmp7 = Unsafe . Add ( ref vecRef , 7 * 2 ) ;
Vector4 z13 = tmp6 + tmp5 ;
Vector4 z10 = tmp6 - tmp5 ;
Vector4 z11 = tmp4 + tmp7 ;
Vector4 z12 = tmp4 - tmp7 ;
tmp7 = z11 + z13 ;
tmp11 = ( z11 - z13 ) * mm128_F_1_4142 ;
z5 = ( z10 + z12 ) * mm128_F_1_8477 ;
// The two "n" constants are negative, so these products are subtracted from z5.
tmp10 = ( z12 * mm128_F_n1_0823 ) + z5 ;
tmp12 = ( z10 * mm128_F_n2_6131 ) + z5 ;
// Each odd-part term depends on the one computed just before it; keep this order.
tmp6 = tmp12 - tmp7 ;
tmp5 = tmp11 - tmp6 ;
tmp4 = tmp10 - tmp5 ;
// Final stage: sums and differences of even and odd terms are stored to the
// row pairs (0, 7), (1, 6), (2, 5) and (3, 4).
Unsafe . Add ( ref vecRef , 0 * 2 ) = tmp0 + tmp7 ;
Unsafe . Add ( ref vecRef , 7 * 2 ) = tmp0 - tmp7 ;
Unsafe . Add ( ref vecRef , 1 * 2 ) = tmp1 + tmp6 ;
Unsafe . Add ( ref vecRef , 6 * 2 ) = tmp1 - tmp6 ;
Unsafe . Add ( ref vecRef , 2 * 2 ) = tmp2 + tmp5 ;
Unsafe . Add ( ref vecRef , 5 * 2 ) = tmp2 - tmp5 ;
Unsafe . Add ( ref vecRef , 3 * 2 ) = tmp3 + tmp4 ;
Unsafe . Add ( ref vecRef , 4 * 2 ) = tmp3 - tmp4 ;
}
}
@ -120,8 +221,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
/// <remarks>
/// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
/// </remarks>
/// <param name="block">Input matrix .</param>
private static void ForwardTransform _Scalar ( ref Block8x8F block )
/// <param name="block">Input block .</param>
private static void FDCT _Scalar ( ref Block8x8F block )
{
const int dctSize = 8 ;
@ -130,17 +231,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
float z1 , z2 , z3 , z4 , z5 , z11 , z13 ;
// First pass - process rows
ref float data Ref = ref Unsafe . As < Block8x8F , float > ( ref block ) ;
ref float block Ref = ref Unsafe . As < Block8x8F , float > ( ref block ) ;
for ( int ctr = 7 ; ctr > = 0 ; ctr - - )
{
tmp0 = Unsafe . Add ( ref data Ref, 0 ) + Unsafe . Add ( ref data Ref, 7 ) ;
tmp7 = Unsafe . Add ( ref data Ref, 0 ) - Unsafe . Add ( ref data Ref, 7 ) ;
tmp1 = Unsafe . Add ( ref data Ref, 1 ) + Unsafe . Add ( ref data Ref, 6 ) ;
tmp6 = Unsafe . Add ( ref data Ref, 1 ) - Unsafe . Add ( ref data Ref, 6 ) ;
tmp2 = Unsafe . Add ( ref data Ref, 2 ) + Unsafe . Add ( ref data Ref, 5 ) ;
tmp5 = Unsafe . Add ( ref data Ref, 2 ) - Unsafe . Add ( ref data Ref, 5 ) ;
tmp3 = Unsafe . Add ( ref data Ref, 3 ) + Unsafe . Add ( ref data Ref, 4 ) ;
tmp4 = Unsafe . Add ( ref data Ref, 3 ) - Unsafe . Add ( ref data Ref, 4 ) ;
tmp0 = Unsafe . Add ( ref block Ref, 0 ) + Unsafe . Add ( ref block Ref, 7 ) ;
tmp7 = Unsafe . Add ( ref block Ref, 0 ) - Unsafe . Add ( ref block Ref, 7 ) ;
tmp1 = Unsafe . Add ( ref block Ref, 1 ) + Unsafe . Add ( ref block Ref, 6 ) ;
tmp6 = Unsafe . Add ( ref block Ref, 1 ) - Unsafe . Add ( ref block Ref, 6 ) ;
tmp2 = Unsafe . Add ( ref block Ref, 2 ) + Unsafe . Add ( ref block Ref, 5 ) ;
tmp5 = Unsafe . Add ( ref block Ref, 2 ) - Unsafe . Add ( ref block Ref, 5 ) ;
tmp3 = Unsafe . Add ( ref block Ref, 3 ) + Unsafe . Add ( ref block Ref, 4 ) ;
tmp4 = Unsafe . Add ( ref block Ref, 3 ) - Unsafe . Add ( ref block Ref, 4 ) ;
// Even part
tmp10 = tmp0 + tmp3 ;
@ -148,12 +249,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
tmp11 = tmp1 + tmp2 ;
tmp12 = tmp1 - tmp2 ;
Unsafe . Add ( ref data Ref, 0 ) = tmp10 + tmp11 ;
Unsafe . Add ( ref data Ref, 4 ) = tmp10 - tmp11 ;
Unsafe . Add ( ref block Ref, 0 ) = tmp10 + tmp11 ;
Unsafe . Add ( ref block Ref, 4 ) = tmp10 - tmp11 ;
z1 = ( tmp12 + tmp13 ) * 0.707106781f ;
Unsafe . Add ( ref data Ref, 2 ) = tmp13 + z1 ;
Unsafe . Add ( ref data Ref, 6 ) = tmp13 - z1 ;
Unsafe . Add ( ref block Ref, 2 ) = tmp13 + z1 ;
Unsafe . Add ( ref block Ref, 6 ) = tmp13 - z1 ;
// Odd part
tmp10 = tmp4 + tmp5 ;
@ -168,26 +269,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
z11 = tmp7 + z3 ;
z13 = tmp7 - z3 ;
Unsafe . Add ( ref data Ref, 5 ) = z13 + z2 ;
Unsafe . Add ( ref data Ref, 3 ) = z13 - z2 ;
Unsafe . Add ( ref data Ref, 1 ) = z11 + z4 ;
Unsafe . Add ( ref data Ref, 7 ) = z11 - z4 ;
Unsafe . Add ( ref block Ref, 5 ) = z13 + z2 ;
Unsafe . Add ( ref block Ref, 3 ) = z13 - z2 ;
Unsafe . Add ( ref block Ref, 1 ) = z11 + z4 ;
Unsafe . Add ( ref block Ref, 7 ) = z11 - z4 ;
data Ref = ref Unsafe . Add ( ref data Ref, dctSize ) ;
block Ref = ref Unsafe . Add ( ref block Ref, dctSize ) ;
}
// Second pass - process columns
data Ref = ref Unsafe . As < Block8x8F , float > ( ref block ) ;
block Ref = ref Unsafe . As < Block8x8F , float > ( ref block ) ;
for ( int ctr = 7 ; ctr > = 0 ; ctr - - )
{
tmp0 = Unsafe . Add ( ref data Ref, dctSize * 0 ) + Unsafe . Add ( ref data Ref, dctSize * 7 ) ;
tmp7 = Unsafe . Add ( ref data Ref, dctSize * 0 ) - Unsafe . Add ( ref data Ref, dctSize * 7 ) ;
tmp1 = Unsafe . Add ( ref data Ref, dctSize * 1 ) + Unsafe . Add ( ref data Ref, dctSize * 6 ) ;
tmp6 = Unsafe . Add ( ref data Ref, dctSize * 1 ) - Unsafe . Add ( ref data Ref, dctSize * 6 ) ;
tmp2 = Unsafe . Add ( ref data Ref, dctSize * 2 ) + Unsafe . Add ( ref data Ref, dctSize * 5 ) ;
tmp5 = Unsafe . Add ( ref data Ref, dctSize * 2 ) - Unsafe . Add ( ref data Ref, dctSize * 5 ) ;
tmp3 = Unsafe . Add ( ref data Ref, dctSize * 3 ) + Unsafe . Add ( ref data Ref, dctSize * 4 ) ;
tmp4 = Unsafe . Add ( ref data Ref, dctSize * 3 ) - Unsafe . Add ( ref data Ref, dctSize * 4 ) ;
tmp0 = Unsafe . Add ( ref block Ref, dctSize * 0 ) + Unsafe . Add ( ref block Ref, dctSize * 7 ) ;
tmp7 = Unsafe . Add ( ref block Ref, dctSize * 0 ) - Unsafe . Add ( ref block Ref, dctSize * 7 ) ;
tmp1 = Unsafe . Add ( ref block Ref, dctSize * 1 ) + Unsafe . Add ( ref block Ref, dctSize * 6 ) ;
tmp6 = Unsafe . Add ( ref block Ref, dctSize * 1 ) - Unsafe . Add ( ref block Ref, dctSize * 6 ) ;
tmp2 = Unsafe . Add ( ref block Ref, dctSize * 2 ) + Unsafe . Add ( ref block Ref, dctSize * 5 ) ;
tmp5 = Unsafe . Add ( ref block Ref, dctSize * 2 ) - Unsafe . Add ( ref block Ref, dctSize * 5 ) ;
tmp3 = Unsafe . Add ( ref block Ref, dctSize * 3 ) + Unsafe . Add ( ref block Ref, dctSize * 4 ) ;
tmp4 = Unsafe . Add ( ref block Ref, dctSize * 3 ) - Unsafe . Add ( ref block Ref, dctSize * 4 ) ;
// Even part
tmp10 = tmp0 + tmp3 ;
@ -195,12 +296,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
tmp11 = tmp1 + tmp2 ;
tmp12 = tmp1 - tmp2 ;
Unsafe . Add ( ref data Ref, dctSize * 0 ) = tmp10 + tmp11 ;
Unsafe . Add ( ref data Ref, dctSize * 4 ) = tmp10 - tmp11 ;
Unsafe . Add ( ref block Ref, dctSize * 0 ) = tmp10 + tmp11 ;
Unsafe . Add ( ref block Ref, dctSize * 4 ) = tmp10 - tmp11 ;
z1 = ( tmp12 + tmp13 ) * 0.707106781f ;
Unsafe . Add ( ref data Ref, dctSize * 2 ) = tmp13 + z1 ;
Unsafe . Add ( ref data Ref, dctSize * 6 ) = tmp13 - z1 ;
Unsafe . Add ( ref block Ref, dctSize * 2 ) = tmp13 + z1 ;
Unsafe . Add ( ref block Ref, dctSize * 6 ) = tmp13 - z1 ;
// Odd part
tmp10 = tmp4 + tmp5 ;
@ -215,12 +316,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
z11 = tmp7 + z3 ;
z13 = tmp7 - z3 ;
Unsafe . Add ( ref data Ref, dctSize * 5 ) = z13 + z2 ;
Unsafe . Add ( ref data Ref, dctSize * 3 ) = z13 - z2 ;
Unsafe . Add ( ref data Ref, dctSize * 1 ) = z11 + z4 ;
Unsafe . Add ( ref data Ref, dctSize * 7 ) = z11 - z4 ;
Unsafe . Add ( ref block Ref, dctSize * 5 ) = z13 + z2 ;
Unsafe . Add ( ref block Ref, dctSize * 3 ) = z13 - z2 ;
Unsafe . Add ( ref block Ref, dctSize * 1 ) = z11 + z4 ;
Unsafe . Add ( ref block Ref, dctSize * 7 ) = z11 - z4 ;
data Ref = ref Unsafe . Add ( ref data Ref, 1 ) ;
block Ref = ref Unsafe . Add ( ref block Ref, 1 ) ;
}
}
@ -230,11 +331,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
/// <remarks>
/// This implementation must be called only if hardware supports 4
/// floating point numbers vector. Otherwise explicit scalar
/// implementation <see cref="ForwardTransform _Scalar"/> is faster
/// because it does not rely on matrix transposition.
/// implementation <see cref="FDCT _Scalar"/> is faster
/// because it does not rely on block transposition.
/// </remarks>
/// <param name="block">Input matrix .</param>
private static void ForwardTransform _Vector4 ( ref Block8x8F block )
/// <param name="block">Input block .</param>
public static void FDCT _Vector4 ( ref Block8x8F block )
{
DebugGuard . IsTrue ( Vector . IsHardwareAccelerated , "Scalar implementation should be called for non-accelerated hardware." ) ;
@ -247,209 +348,50 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
block . TransposeInplace ( ) ;
FDCT8x4_Vector4 ( ref block . V0L ) ;
FDCT8x4_Vector4 ( ref block . V0R ) ;
}
/// <summary>
/// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix.
/// </summary>
/// <remarks>
/// Implemented using Vector4 API operations for either scalar or sse hardware implementation.
/// Must be called on both 8x4 matrix parts for the full FDCT transform.
/// </remarks>
/// <param name="blockRef">Input reference to the first </param>
private static void FDCT8x4_Vector4 ( ref Vector4 blockRef )
{
Vector4 tmp0 = Unsafe . Add ( ref blockRef , 0 ) + Unsafe . Add ( ref blockRef , 1 4 ) ;
Vector4 tmp7 = Unsafe . Add ( ref blockRef , 0 ) - Unsafe . Add ( ref blockRef , 1 4 ) ;
Vector4 tmp1 = Unsafe . Add ( ref blockRef , 2 ) + Unsafe . Add ( ref blockRef , 1 2 ) ;
Vector4 tmp6 = Unsafe . Add ( ref blockRef , 2 ) - Unsafe . Add ( ref blockRef , 1 2 ) ;
Vector4 tmp2 = Unsafe . Add ( ref blockRef , 4 ) + Unsafe . Add ( ref blockRef , 1 0 ) ;
Vector4 tmp5 = Unsafe . Add ( ref blockRef , 4 ) - Unsafe . Add ( ref blockRef , 1 0 ) ;
Vector4 tmp3 = Unsafe . Add ( ref blockRef , 6 ) + Unsafe . Add ( ref blockRef , 8 ) ;
Vector4 tmp4 = Unsafe . Add ( ref blockRef , 6 ) - Unsafe . Add ( ref blockRef , 8 ) ;
// Even part
Vector4 tmp10 = tmp0 + tmp3 ;
Vector4 tmp13 = tmp0 - tmp3 ;
Vector4 tmp11 = tmp1 + tmp2 ;
Vector4 tmp12 = tmp1 - tmp2 ;
Unsafe . Add ( ref blockRef , 0 ) = tmp10 + tmp11 ;
Unsafe . Add ( ref blockRef , 8 ) = tmp10 - tmp11 ;
Vector4 z1 = ( tmp12 + tmp13 ) * mm128_F_0_7071 ;
Unsafe . Add ( ref blockRef , 4 ) = tmp13 + z1 ;
Unsafe . Add ( ref blockRef , 1 2 ) = tmp13 - z1 ;
// Odd part
tmp10 = tmp4 + tmp5 ;
tmp11 = tmp5 + tmp6 ;
tmp12 = tmp6 + tmp7 ;
Vector4 z5 = ( tmp10 - tmp12 ) * mm128_F_0_3826 ;
Vector4 z2 = ( mm128_F_0_5411 * tmp10 ) + z5 ;
Vector4 z4 = ( mm128_F_1_3065 * tmp12 ) + z5 ;
Vector4 z3 = tmp11 * mm128_F_0_7071 ;
Vector4 z11 = tmp7 + z3 ;
Vector4 z13 = tmp7 - z3 ;
Unsafe . Add ( ref blockRef , 1 0 ) = z13 + z2 ;
Unsafe . Add ( ref blockRef , 6 ) = z13 - z2 ;
Unsafe . Add ( ref blockRef , 2 ) = z11 + z4 ;
Unsafe . Add ( ref blockRef , 1 4 ) = z11 - z4 ;
}
// Applies 1D floating point FDCT inplace on 8x4 part of 8x8 block
static void FDCT8x4_Vector4 ( ref Vector4 vecRef )
{
Vector4 tmp0 = Unsafe . Add ( ref vecRef , 0 ) + Unsafe . Add ( ref vecRef , 1 4 ) ;
Vector4 tmp7 = Unsafe . Add ( ref vecRef , 0 ) - Unsafe . Add ( ref vecRef , 1 4 ) ;
Vector4 tmp1 = Unsafe . Add ( ref vecRef , 2 ) + Unsafe . Add ( ref vecRef , 1 2 ) ;
Vector4 tmp6 = Unsafe . Add ( ref vecRef , 2 ) - Unsafe . Add ( ref vecRef , 1 2 ) ;
Vector4 tmp2 = Unsafe . Add ( ref vecRef , 4 ) + Unsafe . Add ( ref vecRef , 1 0 ) ;
Vector4 tmp5 = Unsafe . Add ( ref vecRef , 4 ) - Unsafe . Add ( ref vecRef , 1 0 ) ;
Vector4 tmp3 = Unsafe . Add ( ref vecRef , 6 ) + Unsafe . Add ( ref vecRef , 8 ) ;
Vector4 tmp4 = Unsafe . Add ( ref vecRef , 6 ) - Unsafe . Add ( ref vecRef , 8 ) ;
/// <summary>
/// Apply floating point IDCT inplace.
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
/// </summary>
/// <param name="block">Input matrix.</param>
/// <param name="temp">Matrix to store temporal results.</param>
public static void TransformIDCT ( ref Block8x8F block , ref Block8x8F temp )
{
block . TransposeInplace ( ) ;
IDCT8x8 ( ref block , ref temp ) ;
temp . TransposeInplace ( ) ;
IDCT8x8 ( ref temp , ref block ) ;
// Even part
Vector4 tmp10 = tmp0 + tmp3 ;
Vector4 tmp13 = tmp0 - tmp3 ;
Vector4 tmp11 = tmp1 + tmp2 ;
Vector4 tmp12 = tmp1 - tmp2 ;
// TODO: This can be fused into quantization table step
block . MultiplyInPlace ( C_0_125 ) ;
}
Unsafe . Add ( ref vecRef , 0 ) = tmp10 + tmp11 ;
Unsafe . Add ( ref vecRef , 8 ) = tmp10 - tmp11 ;
/// <summary>
/// Performs 8x8 matrix Inverse Discrete Cosine Transform, using the AVX
/// implementation when it is available and otherwise transforming the left
/// and right 8x4 parts of the block one after the other.
/// </summary>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>
private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx.IsSupported)
    {
        IDCT8x8_Avx(ref s, ref d);
        return;
    }
#endif

    IDCT8x4_LeftPart(ref s, ref d);
    IDCT8x4_RightPart(ref s, ref d);
}
Vector4 z1 = ( tmp12 + tmp13 ) * mm128_F_0_7071 ;
Unsafe . Add ( ref vecRef , 4 ) = tmp13 + z1 ;
Unsafe . Add ( ref vecRef , 1 2 ) = tmp13 - z1 ;
/// <summary>
/// Do IDCT internal operations on the left part of the block. Original src:
/// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
/// </summary>
/// <remarks>
/// Reads the V0L..V7L vectors of <paramref name="s"/> and assigns the V0L..V7L
/// vectors of <paramref name="d"/>; the right-hand vectors are not touched here.
/// </remarks>
/// <param name="s">The source block</param>
/// <param name="d">Destination block</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void IDCT8x4_LeftPart ( ref Block8x8F s , ref Block8x8F d )
{
// Odd part: built from rows 1, 3, 5 and 7.
Vector4 my1 = s . V1L ;
Vector4 my7 = s . V7L ;
Vector4 mz0 = my1 + my7 ;
Vector4 my3 = s . V3L ;
Vector4 mz2 = my3 + my7 ;
Vector4 my5 = s . V5L ;
Vector4 mz1 = my3 + my5 ;
Vector4 mz3 = my1 + my5 ;
Vector4 mz4 = ( mz0 + mz1 ) * C_1_175876 ;
// C_1_961571, C_0_390181, C_0_899976 and C_2_562915 are declared with negative
// values, so the next four products carry a minus sign despite the names.
mz2 = ( mz2 * C_1_961571 ) + mz4 ;
mz3 = ( mz3 * C_0_390181 ) + mz4 ;
mz0 = mz0 * C_0_899976 ;
mz1 = mz1 * C_2_562915 ;
// mb0..mb3 are the odd-part terms used by the output stage.
Vector4 mb3 = ( my7 * C_0_298631 ) + mz0 + mz2 ;
Vector4 mb2 = ( my5 * C_2_053120 ) + mz1 + mz3 ;
Vector4 mb1 = ( my3 * C_3_072711 ) + mz1 + mz2 ;
Vector4 mb0 = ( my1 * C_1_501321 ) + mz0 + mz3 ;
// Even part: built from rows 0, 2, 4 and 6; mz0..mz4 are reused as scratch.
Vector4 my2 = s . V2L ;
Vector4 my6 = s . V6L ;
mz4 = ( my2 + my6 ) * C_0_541196 ;
Vector4 my0 = s . V0L ;
Vector4 my4 = s . V4L ;
mz0 = my0 + my4 ;
mz1 = my0 - my4 ;
// C_1_847759 is declared negative as well.
mz2 = mz4 + ( my6 * C_1_847759 ) ;
mz3 = mz4 + ( my2 * C_0_765367 ) ;
// my0..my3 are overwritten with the even-part terms.
my0 = mz0 + mz3 ;
my3 = mz0 - mz3 ;
my1 = mz1 + mz2 ;
my2 = mz1 - mz2 ;
// Output stage: row k gets even + odd, row 7 - k gets even - odd.
d . V0L = my0 + mb0 ;
d . V7L = my0 - mb0 ;
d . V1L = my1 + mb1 ;
d . V6L = my1 - mb1 ;
d . V2L = my2 + mb2 ;
d . V5L = my2 - mb2 ;
d . V3L = my3 + mb3 ;
d . V4L = my3 - mb3 ;
}
// Odd part
tmp10 = tmp4 + tmp5 ;
tmp11 = tmp5 + tmp6 ;
tmp12 = tmp6 + tmp7 ;
/// <summary>
/// Do IDCT internal operations on the right part of the block.
/// Original src:
/// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
/// </summary>
/// <param name="s">The source block</param>
/// <param name="d">The destination block</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void IDCT8x4_RightPart ( ref Block8x8F s , ref Block8x8F d )
{
Vector4 my1 = s . V1R ;
Vector4 my7 = s . V7R ;
Vector4 mz0 = my1 + my7 ;
Vector4 my3 = s . V3R ;
Vector4 mz2 = my3 + my7 ;
Vector4 my5 = s . V5R ;
Vector4 mz1 = my3 + my5 ;
Vector4 mz3 = my1 + my5 ;
Vector4 mz4 = ( mz0 + mz1 ) * C_1_175876 ;
mz2 = ( mz2 * C_1_961571 ) + mz4 ;
mz3 = ( mz3 * C_0_390181 ) + mz4 ;
mz0 = mz0 * C_0_899976 ;
mz1 = mz1 * C_2_562915 ;
Vector4 mb3 = ( my7 * C_0_298631 ) + mz0 + mz2 ;
Vector4 mb2 = ( my5 * C_2_053120 ) + mz1 + mz3 ;
Vector4 mb1 = ( my3 * C_3_072711 ) + mz1 + mz2 ;
Vector4 mb0 = ( my1 * C_1_501321 ) + mz0 + mz3 ;
Vector4 my2 = s . V2R ;
Vector4 my6 = s . V6R ;
mz4 = ( my2 + my6 ) * C_0_541196 ;
Vector4 my0 = s . V0R ;
Vector4 my4 = s . V4R ;
mz0 = my0 + my4 ;
mz1 = my0 - my4 ;
mz2 = mz4 + ( my6 * C_1_847759 ) ;
mz3 = mz4 + ( my2 * C_0_765367 ) ;
my0 = mz0 + mz3 ;
my3 = mz0 - mz3 ;
my1 = mz1 + mz2 ;
my2 = mz1 - mz2 ;
d . V0R = my0 + mb0 ;
d . V7R = my0 - mb0 ;
d . V1R = my1 + mb1 ;
d . V6R = my1 - mb1 ;
d . V2R = my2 + mb2 ;
d . V5R = my2 - mb2 ;
d . V3R = my3 + mb3 ;
d . V4R = my3 - mb3 ;
Vector4 z5 = ( tmp10 - tmp12 ) * mm128_F_0_3826 ;
Vector4 z2 = ( mm128_F_0_5411 * tmp10 ) + z5 ;
Vector4 z4 = ( mm128_F_1_3065 * tmp12 ) + z5 ;
Vector4 z3 = tmp11 * mm128_F_0_7071 ;
Vector4 z11 = tmp7 + z3 ;
Vector4 z13 = tmp7 - z3 ;
Unsafe . Add ( ref vecRef , 1 0 ) = z13 + z2 ;
Unsafe . Add ( ref vecRef , 6 ) = z13 - z2 ;
Unsafe . Add ( ref vecRef , 2 ) = z11 + z4 ;
Unsafe . Add ( ref vecRef , 1 4 ) = z11 - z4 ;
}
}
}
}