|
|
|
@ -3,6 +3,7 @@ |
|
|
|
|
|
|
|
using System.Numerics; |
|
|
|
using System.Runtime.CompilerServices; |
|
|
|
using System.Runtime.InteropServices; |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
using System.Runtime.Intrinsics.X86; |
|
|
|
#endif
|
|
|
|
@ -15,102 +16,202 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components |
|
|
|
/// </summary>
|
|
|
|
internal static partial class FastFloatingPointDCT |
|
|
|
{ |
|
|
|
#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
|
|
|
|
private const float C_1_175876 = 1.175875602f; |
|
|
|
private const float C_1_961571 = -1.961570560f; |
|
|
|
private const float C_0_390181 = -0.390180644f; |
|
|
|
private const float C_0_899976 = -0.899976223f; |
|
|
|
private const float C_2_562915 = -2.562915447f; |
|
|
|
private const float C_0_298631 = 0.298631336f; |
|
|
|
private const float C_2_053120 = 2.053119869f; |
|
|
|
private const float C_3_072711 = 3.072711026f; |
|
|
|
private const float C_1_501321 = 1.501321110f; |
|
|
|
private const float C_0_541196 = 0.541196100f; |
|
|
|
private const float C_1_847759 = -1.847759065f; |
|
|
|
private const float C_0_765367 = 0.765366865f; |
|
|
|
|
|
|
|
private const float C_0_125 = 0.1250f; |
|
|
|
|
|
|
|
#pragma warning disable SA1311, IDE1006 // naming rules violation warnings
|
|
|
|
private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f); |
|
|
|
private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f); |
|
|
|
private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f); |
|
|
|
private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f); |
|
|
|
#pragma warning restore SA1311, IDE1006
|
|
|
|
|
|
|
|
#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
|
|
|
|
#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
|
|
|
|
private static readonly Vector4 mm128_F_0_7071 = new(0.707106781f); |
|
|
|
private static readonly Vector4 mm128_F_0_3826 = new(0.382683433f); |
|
|
|
private static readonly Vector4 mm128_F_0_5411 = new(0.541196100f); |
|
|
|
private static readonly Vector4 mm128_F_1_3065 = new(1.306562965f); |
|
|
|
|
|
|
|
private static readonly Vector4 mm128_F_1_4142 = new(1.414213562f); |
|
|
|
private static readonly Vector4 mm128_F_1_8477 = new(1.847759065f); |
|
|
|
private static readonly Vector4 mm128_F_n1_0823 = new(-1.082392200f); |
|
|
|
private static readonly Vector4 mm128_F_n2_6131 = new(-2.613125930f); |
|
|
|
#pragma warning restore SA1310, SA1311, IDE1006
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Gets reciprocal coefficients for jpeg quantization tables calculation.
|
|
|
|
/// Gets adjustment table for quantization tables.
|
|
|
|
/// </summary>
|
|
|
|
/// <remarks>
|
|
|
|
/// <para>
|
|
|
|
/// Current FDCT implementation expects its results to be multiplied by
|
|
|
|
/// a reciprocal quantization table. To get 8x8 reciprocal block values in this
|
|
|
|
/// table must be divided by quantization table values scaled with quality settings.
|
|
|
|
/// Current IDCT and FDCT implementations are based on Arai, Agui,
|
|
|
|
/// and Nakajima's algorithm. Both DCT methods do not
|
|
|
|
/// produce finished DCT output, final step is fused into the
|
|
|
|
/// quantization step. Quantization and de-quantization coefficients
|
|
|
|
/// must be multiplied by these values.
|
|
|
|
/// </para>
|
|
|
|
/// <para>
|
|
|
|
/// These values were calculated with this formula:
|
|
|
|
/// <code>
|
|
|
|
/// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
|
|
|
|
/// </code>
|
|
|
|
/// Where:
|
|
|
|
/// Given values were generated by formula:
|
|
|
|
/// <code>
|
|
|
|
/// scalefactor[row] * scalefactor[col], where
|
|
|
|
/// scalefactor[0] = 1
|
|
|
|
/// </code>
|
|
|
|
/// <code>
|
|
|
|
/// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
|
|
|
|
/// </code>
|
|
|
|
/// Values are also scaled by 8 so DCT code won't do extra division/multiplication.
|
|
|
|
/// </para>
|
|
|
|
/// </remarks>
|
|
|
|
internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[] |
|
|
|
private static readonly float[] AdjustmentCoefficients = new float[] |
|
|
|
{ |
|
|
|
0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, |
|
|
|
0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, |
|
|
|
0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, |
|
|
|
0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, |
|
|
|
0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, |
|
|
|
0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, |
|
|
|
0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, |
|
|
|
0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, |
|
|
|
1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f, |
|
|
|
1.3870399f, 1.9238797f, 1.812255f, 1.6309863f, 1.3870399f, 1.0897902f, 0.7506606f, 0.38268346f, |
|
|
|
1.306563f, 1.812255f, 1.707107f, 1.5363555f, 1.306563f, 1.02656f, 0.7071068f, 0.36047992f, |
|
|
|
1.1758755f, 1.6309863f, 1.5363555f, 1.3826833f, 1.1758755f, 0.9238795f, 0.63637924f, 0.32442334f, |
|
|
|
1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f, |
|
|
|
0.78569496f, 1.0897902f, 1.02656f, 0.9238795f, 0.78569496f, 0.61731654f, 0.42521507f, 0.21677275f, |
|
|
|
0.5411961f, 0.7506606f, 0.7071068f, 0.63637924f, 0.5411961f, 0.42521507f, 0.29289323f, 0.14931567f, |
|
|
|
0.27589938f, 0.38268346f, 0.36047992f, 0.32442334f, 0.27589938f, 0.21677275f, 0.14931567f, 0.076120466f, |
|
|
|
}; |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Adjusts given quantization table to be compliant with FDCT implementation.
|
|
|
|
/// Adjusts given quantization table for usage with <see cref="TransformIDCT"/>.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="quantTable">Quantization table to adjust.</param>
|
|
|
|
public static void AdjustToIDCT(ref Block8x8F quantTable)
{
    // View the table and the coefficient array as flat float sequences.
    ref float firstEntry = ref Unsafe.As<Block8x8F, float>(ref quantTable);
    ref float firstCoefficient = ref MemoryMarshal.GetReference<float>(AdjustmentCoefficients);

    for (nint index = 0; index < Block8x8F.Size; index++)
    {
        // Scale every entry by 1/8 and by its matching adjustment coefficient.
        ref float entry = ref Unsafe.Add(ref firstEntry, index);
        entry = 0.125f * entry * Unsafe.Add(ref firstCoefficient, index);
    }

    // Spectral macroblocks are transposed before quantization,
    // so the quantization table has to be transposed as well.
    quantTable.TransposeInplace();
}
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Adjusts given quantization table for usage with <see cref="TransformFDCT"/>.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="quantTable">Quantization table to adjust.</param>
|
|
|
|
public static void AdjustToFDCT(ref Block8x8F quantTable)
{
    // View the table and the coefficient array as flat float sequences.
    ref float firstEntry = ref Unsafe.As<Block8x8F, float>(ref quantTable);
    ref float firstCoefficient = ref MemoryMarshal.GetReference<float>(AdjustmentCoefficients);

    for (nint index = 0; index < Block8x8F.Size; index++)
    {
        // Store the reciprocal form: 1/8 divided by (entry * adjustment coefficient).
        ref float entry = ref Unsafe.Add(ref firstEntry, index);
        entry = 0.125f / (entry * Unsafe.Add(ref firstCoefficient, index));
    }
}
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Apply 2D floating point IDCT inplace.
|
|
|
|
/// </summary>
|
|
|
|
/// <remarks>
|
|
|
|
/// See <see cref="DctReciprocalAdjustmentCoefficients"/> docs for explanation.
|
|
|
|
/// Input block must be dequantized before this method with table
|
|
|
|
/// adjusted by <see cref="AdjustToIDCT"/>.
|
|
|
|
/// </remarks>
|
|
|
|
/// <param name="quantizationtable">Quantization table to adjust.</param>
|
|
|
|
public static void AdjustToFDCT(ref Block8x8F quantizationtable) |
|
|
|
/// <param name="block">Input block.</param>
|
|
|
|
public static void TransformIDCT(ref Block8x8F block) |
|
|
|
{ |
|
|
|
for (int i = 0; i < Block8x8F.Size; i++) |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
if (Avx.IsSupported) |
|
|
|
{ |
|
|
|
quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i]; |
|
|
|
IDCT8x8_Avx(ref block); |
|
|
|
} |
|
|
|
else |
|
|
|
#endif
|
|
|
|
{ |
|
|
|
IDCT_Vector4(ref block); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Apply 2D floating point FDCT inplace.
|
|
|
|
/// Apply 2D floating point IDCT inplace.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="block">Input matrix.</param>
|
|
|
|
/// <remarks>
|
|
|
|
/// Input block must be quantized after this method with table adjusted
|
|
|
|
/// by <see cref="AdjustToFDCT"/>.
|
|
|
|
/// </remarks>
|
|
|
|
/// <param name="block">Input block.</param>
|
|
|
|
public static void TransformFDCT(ref Block8x8F block) |
|
|
|
{ |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
if (Avx.IsSupported) |
|
|
|
{ |
|
|
|
ForwardTransform_Avx(ref block); |
|
|
|
FDCT8x8_Avx(ref block); |
|
|
|
} |
|
|
|
else |
|
|
|
#endif
|
|
|
|
if (Vector.IsHardwareAccelerated) |
|
|
|
{ |
|
|
|
ForwardTransform_Vector4(ref block); |
|
|
|
FDCT_Vector4(ref block); |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
ForwardTransform_Scalar(ref block); |
|
|
|
FDCT_Scalar(ref block); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Apply floating point IDCT inplace using <see cref="Vector4"/> API.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="transposedBlock">Input block.</param>
|
|
|
|
private static void IDCT_Vector4(ref Block8x8F transposedBlock)
{
    DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");

    // First pass - process columns (left half, then right half)
    IDCT8x4_Vector4(ref transposedBlock.V0L);
    IDCT8x4_Vector4(ref transposedBlock.V0R);

    // Second pass - process rows, which become columns after the transpose
    transposedBlock.TransposeInplace();
    IDCT8x4_Vector4(ref transposedBlock.V0L);
    IDCT8x4_Vector4(ref transposedBlock.V0R);

    // Applies 1D floating point IDCT inplace on 8x4 part of 8x8 block
    static void IDCT8x4_Vector4(ref Vector4 vecRef)
    {
        // Distance between consecutive rows, counted in Vector4 elements.
        // NOTE(review): presumably each block row is stored as a left/right Vector4 pair - confirm against Block8x8F.
        const int stride = 2;

        // Even part: rows 0, 2, 4 and 6
        Vector4 row0 = Unsafe.Add(ref vecRef, 0 * stride);
        Vector4 row2 = Unsafe.Add(ref vecRef, 2 * stride);
        Vector4 row4 = Unsafe.Add(ref vecRef, 4 * stride);
        Vector4 row6 = Unsafe.Add(ref vecRef, 6 * stride);

        Vector4 sum04 = row0 + row4;
        Vector4 diff04 = row0 - row4;

        Vector4 sum26 = row2 + row6;
        Vector4 rot26 = ((row2 - row6) * mm128_F_1_4142) - sum26;

        Vector4 even0 = sum04 + sum26;
        Vector4 even3 = sum04 - sum26;
        Vector4 even1 = diff04 + rot26;
        Vector4 even2 = diff04 - rot26;

        // Odd part: rows 1, 3, 5 and 7
        Vector4 row1 = Unsafe.Add(ref vecRef, 1 * stride);
        Vector4 row3 = Unsafe.Add(ref vecRef, 3 * stride);
        Vector4 row5 = Unsafe.Add(ref vecRef, 5 * stride);
        Vector4 row7 = Unsafe.Add(ref vecRef, 7 * stride);

        Vector4 sum53 = row5 + row3;
        Vector4 diff53 = row5 - row3;
        Vector4 sum17 = row1 + row7;
        Vector4 diff17 = row1 - row7;

        Vector4 odd7 = sum17 + sum53;
        Vector4 oddMiddle = (sum17 - sum53) * mm128_F_1_4142;

        // Term shared by the two remaining odd outputs
        Vector4 oddShared = (diff53 + diff17) * mm128_F_1_8477;

        Vector4 oddLow = (diff17 * mm128_F_n1_0823) + oddShared;
        Vector4 oddHigh = (diff53 * mm128_F_n2_6131) + oddShared;

        // Each odd output depends on the previous one, so the order matters here
        Vector4 odd6 = oddHigh - odd7;
        Vector4 odd5 = oddMiddle - odd6;
        Vector4 odd4 = oddLow - odd5;

        // Combine even and odd halves and write the results back in place
        Unsafe.Add(ref vecRef, 0 * stride) = even0 + odd7;
        Unsafe.Add(ref vecRef, 7 * stride) = even0 - odd7;
        Unsafe.Add(ref vecRef, 1 * stride) = even1 + odd6;
        Unsafe.Add(ref vecRef, 6 * stride) = even1 - odd6;
        Unsafe.Add(ref vecRef, 2 * stride) = even2 + odd5;
        Unsafe.Add(ref vecRef, 5 * stride) = even2 - odd5;
        Unsafe.Add(ref vecRef, 3 * stride) = even3 + odd4;
        Unsafe.Add(ref vecRef, 4 * stride) = even3 - odd4;
    }
}
|
|
|
|
|
|
|
@ -120,8 +221,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components |
|
|
|
/// <remarks>
|
|
|
|
/// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
|
|
|
|
/// </remarks>
|
|
|
|
/// <param name="block">Input matrix.</param>
|
|
|
|
private static void ForwardTransform_Scalar(ref Block8x8F block) |
|
|
|
/// <param name="block">Input block.</param>
|
|
|
|
private static void FDCT_Scalar(ref Block8x8F block) |
|
|
|
{ |
|
|
|
const int dctSize = 8; |
|
|
|
|
|
|
|
@ -130,17 +231,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components |
|
|
|
float z1, z2, z3, z4, z5, z11, z13; |
|
|
|
|
|
|
|
// First pass - process rows
|
|
|
|
ref float dataRef = ref Unsafe.As<Block8x8F, float>(ref block); |
|
|
|
ref float blockRef = ref Unsafe.As<Block8x8F, float>(ref block); |
|
|
|
for (int ctr = 7; ctr >= 0; ctr--) |
|
|
|
{ |
|
|
|
tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7); |
|
|
|
tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7); |
|
|
|
tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6); |
|
|
|
tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6); |
|
|
|
tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5); |
|
|
|
tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5); |
|
|
|
tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4); |
|
|
|
tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4); |
|
|
|
tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 7); |
|
|
|
tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 7); |
|
|
|
tmp1 = Unsafe.Add(ref blockRef, 1) + Unsafe.Add(ref blockRef, 6); |
|
|
|
tmp6 = Unsafe.Add(ref blockRef, 1) - Unsafe.Add(ref blockRef, 6); |
|
|
|
tmp2 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 5); |
|
|
|
tmp5 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 5); |
|
|
|
tmp3 = Unsafe.Add(ref blockRef, 3) + Unsafe.Add(ref blockRef, 4); |
|
|
|
tmp4 = Unsafe.Add(ref blockRef, 3) - Unsafe.Add(ref blockRef, 4); |
|
|
|
|
|
|
|
// Even part
|
|
|
|
tmp10 = tmp0 + tmp3; |
|
|
|
@ -148,12 +249,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components |
|
|
|
tmp11 = tmp1 + tmp2; |
|
|
|
tmp12 = tmp1 - tmp2; |
|
|
|
|
|
|
|
Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11; |
|
|
|
Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11; |
|
|
|
Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11; |
|
|
|
Unsafe.Add(ref blockRef, 4) = tmp10 - tmp11; |
|
|
|
|
|
|
|
z1 = (tmp12 + tmp13) * 0.707106781f; |
|
|
|
Unsafe.Add(ref dataRef, 2) = tmp13 + z1; |
|
|
|
Unsafe.Add(ref dataRef, 6) = tmp13 - z1; |
|
|
|
Unsafe.Add(ref blockRef, 2) = tmp13 + z1; |
|
|
|
Unsafe.Add(ref blockRef, 6) = tmp13 - z1; |
|
|
|
|
|
|
|
// Odd part
|
|
|
|
tmp10 = tmp4 + tmp5; |
|
|
|
@ -168,26 +269,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components |
|
|
|
z11 = tmp7 + z3; |
|
|
|
z13 = tmp7 - z3; |
|
|
|
|
|
|
|
Unsafe.Add(ref dataRef, 5) = z13 + z2; |
|
|
|
Unsafe.Add(ref dataRef, 3) = z13 - z2; |
|
|
|
Unsafe.Add(ref dataRef, 1) = z11 + z4; |
|
|
|
Unsafe.Add(ref dataRef, 7) = z11 - z4; |
|
|
|
Unsafe.Add(ref blockRef, 5) = z13 + z2; |
|
|
|
Unsafe.Add(ref blockRef, 3) = z13 - z2; |
|
|
|
Unsafe.Add(ref blockRef, 1) = z11 + z4; |
|
|
|
Unsafe.Add(ref blockRef, 7) = z11 - z4; |
|
|
|
|
|
|
|
dataRef = ref Unsafe.Add(ref dataRef, dctSize); |
|
|
|
blockRef = ref Unsafe.Add(ref blockRef, dctSize); |
|
|
|
} |
|
|
|
|
|
|
|
// Second pass - process columns
|
|
|
|
dataRef = ref Unsafe.As<Block8x8F, float>(ref block); |
|
|
|
blockRef = ref Unsafe.As<Block8x8F, float>(ref block); |
|
|
|
for (int ctr = 7; ctr >= 0; ctr--) |
|
|
|
{ |
|
|
|
tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7); |
|
|
|
tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7); |
|
|
|
tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6); |
|
|
|
tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6); |
|
|
|
tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5); |
|
|
|
tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5); |
|
|
|
tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4); |
|
|
|
tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4); |
|
|
|
tmp0 = Unsafe.Add(ref blockRef, dctSize * 0) + Unsafe.Add(ref blockRef, dctSize * 7); |
|
|
|
tmp7 = Unsafe.Add(ref blockRef, dctSize * 0) - Unsafe.Add(ref blockRef, dctSize * 7); |
|
|
|
tmp1 = Unsafe.Add(ref blockRef, dctSize * 1) + Unsafe.Add(ref blockRef, dctSize * 6); |
|
|
|
tmp6 = Unsafe.Add(ref blockRef, dctSize * 1) - Unsafe.Add(ref blockRef, dctSize * 6); |
|
|
|
tmp2 = Unsafe.Add(ref blockRef, dctSize * 2) + Unsafe.Add(ref blockRef, dctSize * 5); |
|
|
|
tmp5 = Unsafe.Add(ref blockRef, dctSize * 2) - Unsafe.Add(ref blockRef, dctSize * 5); |
|
|
|
tmp3 = Unsafe.Add(ref blockRef, dctSize * 3) + Unsafe.Add(ref blockRef, dctSize * 4); |
|
|
|
tmp4 = Unsafe.Add(ref blockRef, dctSize * 3) - Unsafe.Add(ref blockRef, dctSize * 4); |
|
|
|
|
|
|
|
// Even part
|
|
|
|
tmp10 = tmp0 + tmp3; |
|
|
|
@ -195,12 +296,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components |
|
|
|
tmp11 = tmp1 + tmp2; |
|
|
|
tmp12 = tmp1 - tmp2; |
|
|
|
|
|
|
|
Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11; |
|
|
|
Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11; |
|
|
|
Unsafe.Add(ref blockRef, dctSize * 0) = tmp10 + tmp11; |
|
|
|
Unsafe.Add(ref blockRef, dctSize * 4) = tmp10 - tmp11; |
|
|
|
|
|
|
|
z1 = (tmp12 + tmp13) * 0.707106781f; |
|
|
|
Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1; |
|
|
|
Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1; |
|
|
|
Unsafe.Add(ref blockRef, dctSize * 2) = tmp13 + z1; |
|
|
|
Unsafe.Add(ref blockRef, dctSize * 6) = tmp13 - z1; |
|
|
|
|
|
|
|
// Odd part
|
|
|
|
tmp10 = tmp4 + tmp5; |
|
|
|
@ -215,12 +316,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components |
|
|
|
z11 = tmp7 + z3; |
|
|
|
z13 = tmp7 - z3; |
|
|
|
|
|
|
|
Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2; |
|
|
|
Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2; |
|
|
|
Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4; |
|
|
|
Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4; |
|
|
|
Unsafe.Add(ref blockRef, dctSize * 5) = z13 + z2; |
|
|
|
Unsafe.Add(ref blockRef, dctSize * 3) = z13 - z2; |
|
|
|
Unsafe.Add(ref blockRef, dctSize * 1) = z11 + z4; |
|
|
|
Unsafe.Add(ref blockRef, dctSize * 7) = z11 - z4; |
|
|
|
|
|
|
|
dataRef = ref Unsafe.Add(ref dataRef, 1); |
|
|
|
blockRef = ref Unsafe.Add(ref blockRef, 1); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
@ -230,11 +331,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components |
|
|
|
/// <remarks>
|
|
|
|
/// This implementation must be called only if hardware supports 4
|
|
|
|
/// floating point numbers vector. Otherwise explicit scalar
|
|
|
|
/// implementation <see cref="ForwardTransform_Scalar"/> is faster
|
|
|
|
/// because it does not rely on matrix transposition.
|
|
|
|
/// implementation <see cref="FDCT_Scalar"/> is faster
|
|
|
|
/// because it does not rely on block transposition.
|
|
|
|
/// </remarks>
|
|
|
|
/// <param name="block">Input matrix.</param>
|
|
|
|
private static void ForwardTransform_Vector4(ref Block8x8F block) |
|
|
|
/// <param name="block">Input block.</param>
|
|
|
|
public static void FDCT_Vector4(ref Block8x8F block) |
|
|
|
{ |
|
|
|
DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); |
|
|
|
|
|
|
|
@ -247,209 +348,50 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components |
|
|
|
block.TransposeInplace(); |
|
|
|
FDCT8x4_Vector4(ref block.V0L); |
|
|
|
FDCT8x4_Vector4(ref block.V0R); |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix.
|
|
|
|
/// </summary>
|
|
|
|
/// <remarks>
|
|
|
|
/// Implemented using Vector4 API operations for either scalar or sse hardware implementation.
|
|
|
|
/// Must be called on both 8x4 matrix parts for the full FDCT transform.
|
|
|
|
/// </remarks>
|
|
|
|
/// <param name="blockRef">Input reference to the first <see cref="Vector4"/> of the 8x4 part to transform.</param>
|
|
|
|
private static void FDCT8x4_Vector4(ref Vector4 blockRef)
{
    // Rows of this 8x4 part sit two Vector4 elements apart (offsets 0, 2, ..., 14).
    ref Vector4 row0 = ref Unsafe.Add(ref blockRef, 0);
    ref Vector4 row1 = ref Unsafe.Add(ref blockRef, 2);
    ref Vector4 row2 = ref Unsafe.Add(ref blockRef, 4);
    ref Vector4 row3 = ref Unsafe.Add(ref blockRef, 6);
    ref Vector4 row4 = ref Unsafe.Add(ref blockRef, 8);
    ref Vector4 row5 = ref Unsafe.Add(ref blockRef, 10);
    ref Vector4 row6 = ref Unsafe.Add(ref blockRef, 12);
    ref Vector4 row7 = ref Unsafe.Add(ref blockRef, 14);

    // Butterfly stage: every input is read here, before any row is overwritten.
    Vector4 sum07 = row0 + row7;
    Vector4 diff07 = row0 - row7;
    Vector4 sum16 = row1 + row6;
    Vector4 diff16 = row1 - row6;
    Vector4 sum25 = row2 + row5;
    Vector4 diff25 = row2 - row5;
    Vector4 sum34 = row3 + row4;
    Vector4 diff34 = row3 - row4;

    // Even part
    Vector4 outerSum = sum07 + sum34;
    Vector4 outerDiff = sum07 - sum34;
    Vector4 innerSum = sum16 + sum25;
    Vector4 innerDiff = sum16 - sum25;

    row0 = outerSum + innerSum;
    row4 = outerSum - innerSum;

    Vector4 evenRotation = (innerDiff + outerDiff) * mm128_F_0_7071;
    row2 = outerDiff + evenRotation;
    row6 = outerDiff - evenRotation;

    // Odd part
    Vector4 pair45 = diff34 + diff25;
    Vector4 pair56 = diff25 + diff16;
    Vector4 pair67 = diff16 + diff07;

    Vector4 sharedTerm = (pair45 - pair67) * mm128_F_0_3826;
    Vector4 lowTerm = (mm128_F_0_5411 * pair45) + sharedTerm;
    Vector4 highTerm = (mm128_F_1_3065 * pair67) + sharedTerm;
    Vector4 middleTerm = pair56 * mm128_F_0_7071;

    Vector4 upper = diff07 + middleTerm;
    Vector4 lower = diff07 - middleTerm;

    row5 = lower + lowTerm;
    row3 = lower - lowTerm;
    row1 = upper + highTerm;
    row7 = upper - highTerm;
}
|
|
|
// Applies 1D floating point FDCT inplace on 8x4 part of 8x8 block
|
|
|
|
static void FDCT8x4_Vector4(ref Vector4 vecRef) |
|
|
|
{ |
|
|
|
Vector4 tmp0 = Unsafe.Add(ref vecRef, 0) + Unsafe.Add(ref vecRef, 14); |
|
|
|
Vector4 tmp7 = Unsafe.Add(ref vecRef, 0) - Unsafe.Add(ref vecRef, 14); |
|
|
|
Vector4 tmp1 = Unsafe.Add(ref vecRef, 2) + Unsafe.Add(ref vecRef, 12); |
|
|
|
Vector4 tmp6 = Unsafe.Add(ref vecRef, 2) - Unsafe.Add(ref vecRef, 12); |
|
|
|
Vector4 tmp2 = Unsafe.Add(ref vecRef, 4) + Unsafe.Add(ref vecRef, 10); |
|
|
|
Vector4 tmp5 = Unsafe.Add(ref vecRef, 4) - Unsafe.Add(ref vecRef, 10); |
|
|
|
Vector4 tmp3 = Unsafe.Add(ref vecRef, 6) + Unsafe.Add(ref vecRef, 8); |
|
|
|
Vector4 tmp4 = Unsafe.Add(ref vecRef, 6) - Unsafe.Add(ref vecRef, 8); |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Apply floating point IDCT inplace.
|
|
|
|
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="block">Input matrix.</param>
|
|
|
|
/// <param name="temp">Matrix to store temporary results.</param>
|
|
|
|
public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) |
|
|
|
{ |
|
|
|
block.TransposeInplace(); |
|
|
|
IDCT8x8(ref block, ref temp); |
|
|
|
temp.TransposeInplace(); |
|
|
|
IDCT8x8(ref temp, ref block); |
|
|
|
// Even part
|
|
|
|
Vector4 tmp10 = tmp0 + tmp3; |
|
|
|
Vector4 tmp13 = tmp0 - tmp3; |
|
|
|
Vector4 tmp11 = tmp1 + tmp2; |
|
|
|
Vector4 tmp12 = tmp1 - tmp2; |
|
|
|
|
|
|
|
// TODO: This can be fused into quantization table step
|
|
|
|
block.MultiplyInPlace(C_0_125); |
|
|
|
} |
|
|
|
Unsafe.Add(ref vecRef, 0) = tmp10 + tmp11; |
|
|
|
Unsafe.Add(ref vecRef, 8) = tmp10 - tmp11; |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Performs 8x8 matrix Inverse Discrete Cosine Transform
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="s">Source</param>
|
|
|
|
/// <param name="d">Destination</param>
|
|
|
|
private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    // Prefer the AVX implementation whenever the runtime reports support for it.
    if (Avx.IsSupported)
    {
        IDCT8x8_Avx(ref s, ref d);
        return;
    }
#endif

    // Fallback: transform the left and right 8x4 halves separately.
    IDCT8x4_LeftPart(ref s, ref d);
    IDCT8x4_RightPart(ref s, ref d);
}
|
|
|
Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071; |
|
|
|
Unsafe.Add(ref vecRef, 4) = tmp13 + z1; |
|
|
|
Unsafe.Add(ref vecRef, 12) = tmp13 - z1; |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Do IDCT internal operations on the left part of the block. Original src:
|
|
|
|
/// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="s">The source block</param>
|
|
|
|
/// <param name="d">Destination block</param>
|
|
|
|
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
{
    // Read every left-half row of the source before anything is written to the destination.
    Vector4 row0 = s.V0L;
    Vector4 row1 = s.V1L;
    Vector4 row2 = s.V2L;
    Vector4 row3 = s.V3L;
    Vector4 row4 = s.V4L;
    Vector4 row5 = s.V5L;
    Vector4 row6 = s.V6L;
    Vector4 row7 = s.V7L;

    // Odd part: built from rows 1, 3, 5 and 7
    Vector4 sum17 = row1 + row7;
    Vector4 sum35 = row3 + row5;
    Vector4 sum37 = row3 + row7;
    Vector4 sum15 = row1 + row5;

    Vector4 oddShared = (sum17 + sum35) * C_1_175876;

    Vector4 term37 = (sum37 * C_1_961571) + oddShared;
    Vector4 term15 = (sum15 * C_0_390181) + oddShared;
    Vector4 term17 = sum17 * C_0_899976;
    Vector4 term35 = sum35 * C_2_562915;

    Vector4 odd3 = (row7 * C_0_298631) + term17 + term37;
    Vector4 odd2 = (row5 * C_2_053120) + term35 + term15;
    Vector4 odd1 = (row3 * C_3_072711) + term35 + term37;
    Vector4 odd0 = (row1 * C_1_501321) + term17 + term15;

    // Even part: built from rows 0, 2, 4 and 6
    Vector4 evenShared = (row2 + row6) * C_0_541196;
    Vector4 sum04 = row0 + row4;
    Vector4 diff04 = row0 - row4;

    Vector4 term6 = evenShared + (row6 * C_1_847759);
    Vector4 term2 = evenShared + (row2 * C_0_765367);

    Vector4 even0 = sum04 + term2;
    Vector4 even3 = sum04 - term2;
    Vector4 even1 = diff04 + term6;
    Vector4 even2 = diff04 - term6;

    // Combine even and odd halves into the destination's left half
    d.V0L = even0 + odd0;
    d.V7L = even0 - odd0;
    d.V1L = even1 + odd1;
    d.V6L = even1 - odd1;
    d.V2L = even2 + odd2;
    d.V5L = even2 - odd2;
    d.V3L = even3 + odd3;
    d.V4L = even3 - odd3;
}
|
|
|
// Odd part
|
|
|
|
tmp10 = tmp4 + tmp5; |
|
|
|
tmp11 = tmp5 + tmp6; |
|
|
|
tmp12 = tmp6 + tmp7; |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Do IDCT internal operations on the right part of the block.
|
|
|
|
/// Original src:
|
|
|
|
/// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="s">The source block</param>
|
|
|
|
/// <param name="d">The destination block</param>
|
|
|
|
[MethodImpl(MethodImplOptions.AggressiveInlining)] |
|
|
|
public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) |
|
|
|
{ |
|
|
|
Vector4 my1 = s.V1R; |
|
|
|
Vector4 my7 = s.V7R; |
|
|
|
Vector4 mz0 = my1 + my7; |
|
|
|
|
|
|
|
Vector4 my3 = s.V3R; |
|
|
|
Vector4 mz2 = my3 + my7; |
|
|
|
Vector4 my5 = s.V5R; |
|
|
|
Vector4 mz1 = my3 + my5; |
|
|
|
Vector4 mz3 = my1 + my5; |
|
|
|
|
|
|
|
Vector4 mz4 = (mz0 + mz1) * C_1_175876; |
|
|
|
|
|
|
|
mz2 = (mz2 * C_1_961571) + mz4; |
|
|
|
mz3 = (mz3 * C_0_390181) + mz4; |
|
|
|
mz0 = mz0 * C_0_899976; |
|
|
|
mz1 = mz1 * C_2_562915; |
|
|
|
|
|
|
|
Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; |
|
|
|
Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; |
|
|
|
Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; |
|
|
|
Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; |
|
|
|
|
|
|
|
Vector4 my2 = s.V2R; |
|
|
|
Vector4 my6 = s.V6R; |
|
|
|
mz4 = (my2 + my6) * C_0_541196; |
|
|
|
Vector4 my0 = s.V0R; |
|
|
|
Vector4 my4 = s.V4R; |
|
|
|
mz0 = my0 + my4; |
|
|
|
mz1 = my0 - my4; |
|
|
|
|
|
|
|
mz2 = mz4 + (my6 * C_1_847759); |
|
|
|
mz3 = mz4 + (my2 * C_0_765367); |
|
|
|
|
|
|
|
my0 = mz0 + mz3; |
|
|
|
my3 = mz0 - mz3; |
|
|
|
my1 = mz1 + mz2; |
|
|
|
my2 = mz1 - mz2; |
|
|
|
|
|
|
|
d.V0R = my0 + mb0; |
|
|
|
d.V7R = my0 - mb0; |
|
|
|
d.V1R = my1 + mb1; |
|
|
|
d.V6R = my1 - mb1; |
|
|
|
d.V2R = my2 + mb2; |
|
|
|
d.V5R = my2 - mb2; |
|
|
|
d.V3R = my3 + mb3; |
|
|
|
d.V4R = my3 - mb3; |
|
|
|
Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826; |
|
|
|
Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5; |
|
|
|
Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5; |
|
|
|
Vector4 z3 = tmp11 * mm128_F_0_7071; |
|
|
|
|
|
|
|
Vector4 z11 = tmp7 + z3; |
|
|
|
Vector4 z13 = tmp7 - z3; |
|
|
|
|
|
|
|
Unsafe.Add(ref vecRef, 10) = z13 + z2; |
|
|
|
Unsafe.Add(ref vecRef, 6) = z13 - z2; |
|
|
|
Unsafe.Add(ref vecRef, 2) = z11 + z4; |
|
|
|
Unsafe.Add(ref vecRef, 14) = z11 - z4; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|