Browse Source

Merge branch 'master' into bp/vectoraddavx

pull/1849/head
Brian Popow 4 years ago
committed by GitHub
parent
commit
4e6d96f239
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 2
      shared-infrastructure
  2. 58
      src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
  3. 10
      src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
  4. 8
      src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
  5. 237
      src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
  6. 532
      src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
  7. 29
      src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
  8. 3
      src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
  9. 11
      tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs
  10. 5
      tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
  11. 26
      tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
  12. 209
      tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
  13. 2
      tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs
  14. 15
      tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
  15. 17
      tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
  16. 14
      tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs

2
shared-infrastructure

@ -1 +1 @@
Subproject commit 33cb12ca77f919b44de56f344d2627cc2a108c3a Subproject commit a042aba176cdb840d800c6ed4cfe41a54fb7b1e3

58
src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs

@ -337,6 +337,64 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
} }
} }
/// <summary>
/// Transposes this block in place: the element at flat index
/// <c>(row * 8) + col</c> trades places with the one at <c>(col * 8) + row</c>.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public void TransposeInplace()
{
    const int side = 8;

    // View the block as a flat run of 16-bit elements.
    ref short first = ref Unsafe.As<Block8x8, short>(ref this);

    // Walk the strict upper triangle only: diagonal entries stay where they
    // are and every off-diagonal pair is exchanged exactly once.
    for (int row = 0; row < side - 1; row++)
    {
        for (int col = row + 1; col < side; col++)
        {
            ref short upper = ref Unsafe.Add(ref first, (row * side) + col);
            ref short lower = ref Unsafe.Add(ref first, (col * side) + row);

            short held = upper;
            upper = lower;
            lower = held;
        }
    }
}
/// <summary> /// <summary>
/// Calculate the total sum of absolute differences of elements in 'a' and 'b'. /// Calculate the total sum of absolute differences of elements in 'a' and 'b'.
/// </summary> /// </summary>

10
src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs

@ -502,7 +502,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
{ {
i += r; i += r;
s = buffer.Receive(s); s = buffer.Receive(s);
Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s; Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[i++]) = (short)s;
} }
else else
{ {
@ -571,7 +571,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
if (s != 0) if (s != 0)
{ {
s = buffer.Receive(s); s = buffer.Receive(s);
Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low); Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[i]) = (short)(s << low);
} }
else else
{ {
@ -647,7 +647,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
do do
{ {
ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]); ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]);
if (coef != 0) if (coef != 0)
{ {
buffer.CheckBits(); buffer.CheckBits();
@ -673,7 +673,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
if ((s != 0) && (k < 64)) if ((s != 0) && (k < 64))
{ {
Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s; Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]) = (short)s;
} }
} }
} }
@ -682,7 +682,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
{ {
for (; k <= end; k++) for (; k <= end; k++)
{ {
ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]); ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]);
if (coef != 0) if (coef != 0)
{ {

8
src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs

@ -18,11 +18,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
/// </summary> /// </summary>
public Block8x8F SourceBlock; public Block8x8F SourceBlock;
/// <summary>
/// Temporal block to store intermediate computation results.
/// </summary>
public Block8x8F WorkspaceBlock;
/// <summary> /// <summary>
/// The quantization table as <see cref="Block8x8F"/>. /// The quantization table as <see cref="Block8x8F"/>.
/// </summary> /// </summary>
@ -45,7 +40,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
this.subSamplingDivisors = component.SubSamplingDivisors; this.subSamplingDivisors = component.SubSamplingDivisors;
this.SourceBlock = default; this.SourceBlock = default;
this.WorkspaceBlock = default;
} }
/// <summary> /// <summary>
@ -71,7 +65,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
// Dequantize: // Dequantize:
block.MultiplyInPlace(ref this.DequantiazationTable); block.MultiplyInPlace(ref this.DequantiazationTable);
FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock); FastFloatingPointDCT.TransformIDCT(ref block);
// To conform better to libjpeg we actually NEED TO lose precision here. // To conform better to libjpeg we actually NEED TO lose precision here.
// This is because they store blocks as Int16 between all the operations. // This is because they store blocks as Int16 between all the operations.

237
src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs

@ -2,9 +2,6 @@
// Licensed under the Apache License, Version 2.0. // Licensed under the Apache License, Version 2.0.
#if SUPPORTS_RUNTIME_INTRINSICS #if SUPPORTS_RUNTIME_INTRINSICS
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics; using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics.X86;
@ -12,149 +9,147 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{ {
internal static partial class FastFloatingPointDCT internal static partial class FastFloatingPointDCT
{ {
#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings #pragma warning disable SA1310, SA1311, IDE1006 // naming rule violation warnings
private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f); private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f); private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f); private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f); private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
private static readonly Vector256<float> mm256_F_1_1758 = Vector256.Create(1.175876f); private static readonly Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
private static readonly Vector256<float> mm256_F_n1_9615 = Vector256.Create(-1.961570560f); private static readonly Vector256<float> mm256_F_1_8477 = Vector256.Create(1.847759065f);
private static readonly Vector256<float> mm256_F_n0_3901 = Vector256.Create(-0.390180644f); private static readonly Vector256<float> mm256_F_n1_0823 = Vector256.Create(-1.082392200f);
private static readonly Vector256<float> mm256_F_n0_8999 = Vector256.Create(-0.899976223f); private static readonly Vector256<float> mm256_F_n2_6131 = Vector256.Create(-2.613125930f);
private static readonly Vector256<float> mm256_F_n2_5629 = Vector256.Create(-2.562915447f);
private static readonly Vector256<float> mm256_F_0_2986 = Vector256.Create(0.298631336f);
private static readonly Vector256<float> mm256_F_2_0531 = Vector256.Create(2.053119869f);
private static readonly Vector256<float> mm256_F_3_0727 = Vector256.Create(3.072711026f);
private static readonly Vector256<float> mm256_F_1_5013 = Vector256.Create(1.501321110f);
private static readonly Vector256<float> mm256_F_n1_8477 = Vector256.Create(-1.847759065f);
private static readonly Vector256<float> mm256_F_0_7653 = Vector256.Create(0.765366865f);
#pragma warning restore SA1310, SA1311, IDE1006 #pragma warning restore SA1310, SA1311, IDE1006
/// <summary> /// <summary>
/// Apply floating point FDCT inplace using simd operations. /// Apply floating point FDCT inplace using simd operations.
/// </summary> /// </summary>
/// <param name="block">Input matrix.</param> /// <param name="block">Input block.</param>
private static void ForwardTransform_Avx(ref Block8x8F block) private static void FDCT8x8_Avx(ref Block8x8F block)
{ {
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
// First pass - process rows // First pass - process rows
block.TransposeInplace(); block.TransposeInplace();
FDCT8x8_Avx(ref block); FDCT8x8_1D_Avx(ref block);
// Second pass - process columns // Second pass - process columns
block.TransposeInplace(); block.TransposeInplace();
FDCT8x8_Avx(ref block); FDCT8x8_1D_Avx(ref block);
// Applies one 1D floating point FDCT pass in place.
// Every operation below is element-wise on Vector256<float> values, so the
// eight float lanes of V0..V7 are transformed independently in one call.
// The caller runs this twice with a transpose before each pass.
// NOTE(review): per the AdjustmentCoefficients documentation in this class the
// DCT output is not final - the last scaling step is fused into quantization.
static void FDCT8x8_1D_Avx(ref Block8x8F block)
{
// Butterfly stage: sums (tmp0..tmp3) and differences (tmp4..tmp7) of the
// mirrored pairs V0/V7, V1/V6, V2/V5 and V3/V4.
Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);
// Even part: consumes the sums and writes V0, V4, V2 and V6.
Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
block.V0 = Avx.Add(tmp10, tmp11);
block.V4 = Avx.Subtract(tmp10, tmp11);
Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
block.V2 = Avx.Add(tmp13, z1);
block.V6 = Avx.Subtract(tmp13, z1);
// Odd part: consumes the differences and writes V5, V3, V1 and V7.
// tmp10..tmp12 are reused as scratch from here on; their even-part values
// are no longer needed.
tmp10 = Avx.Add(tmp4, tmp5);
tmp11 = Avx.Add(tmp5, tmp6);
tmp12 = Avx.Add(tmp6, tmp7);
Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
// NOTE(review): MultiplyAdd(a, b, c) presumably evaluates a + (b * c), i.e.
// z2 = z5 + (0.5411 * tmp10) and z4 = z5 + (1.3065 * tmp12) - confirm
// against SimdUtils.HwIntrinsics.
Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
Vector256<float> z11 = Avx.Add(tmp7, z3);
Vector256<float> z13 = Avx.Subtract(tmp7, z3);
block.V5 = Avx.Add(z13, z2);
block.V3 = Avx.Subtract(z13, z2);
block.V1 = Avx.Add(z11, z4);
block.V7 = Avx.Subtract(z11, z4);
}
} }
/// <summary> /// <summary>
/// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix. /// Apply floating point IDCT inplace using simd operations.
/// </summary> /// </summary>
/// <remarks> /// <param name="transposedBlock">Transposed input block.</param>
/// Requires Avx support. private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
/// </remarks>
/// <param name="block">Input matrix.</param>
public static void FDCT8x8_Avx(ref Block8x8F block)
{ {
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
Vector256<float> tmp0 = Avx.Add(block.V0, block.V7); // First pass - process columns
Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7); IDCT8x8_1D_Avx(ref transposedBlock);
Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6); // Second pass - process rows
Vector256<float> tmp2 = Avx.Add(block.V2, block.V5); transposedBlock.TransposeInplace();
Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5); IDCT8x8_1D_Avx(ref transposedBlock);
Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4); // Applies 1D floating point IDCT inplace
static void IDCT8x8_1D_Avx(ref Block8x8F block)
// Even part {
Vector256<float> tmp10 = Avx.Add(tmp0, tmp3); // Even part
Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3); Vector256<float> tmp0 = block.V0;
Vector256<float> tmp11 = Avx.Add(tmp1, tmp2); Vector256<float> tmp1 = block.V2;
Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2); Vector256<float> tmp2 = block.V4;
Vector256<float> tmp3 = block.V6;
block.V0 = Avx.Add(tmp10, tmp11);
block.V4 = Avx.Subtract(tmp10, tmp11); Vector256<float> z5 = tmp0;
Vector256<float> tmp10 = Avx.Add(z5, tmp2);
Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); Vector256<float> tmp11 = Avx.Subtract(z5, tmp2);
block.V2 = Avx.Add(tmp13, z1);
block.V6 = Avx.Subtract(tmp13, z1); Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
// Odd part
tmp10 = Avx.Add(tmp4, tmp5); tmp0 = Avx.Add(tmp10, tmp13);
tmp11 = Avx.Add(tmp5, tmp6); tmp3 = Avx.Subtract(tmp10, tmp13);
tmp12 = Avx.Add(tmp6, tmp7); tmp1 = Avx.Add(tmp11, tmp12);
tmp2 = Avx.Subtract(tmp11, tmp12);
Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10); // Odd part
Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12); Vector256<float> tmp4 = block.V1;
Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071); Vector256<float> tmp5 = block.V3;
Vector256<float> tmp6 = block.V5;
Vector256<float> z11 = Avx.Add(tmp7, z3); Vector256<float> tmp7 = block.V7;
Vector256<float> z13 = Avx.Subtract(tmp7, z3);
Vector256<float> z13 = Avx.Add(tmp6, tmp5);
block.V5 = Avx.Add(z13, z2); Vector256<float> z10 = Avx.Subtract(tmp6, tmp5);
block.V3 = Avx.Subtract(z13, z2); Vector256<float> z11 = Avx.Add(tmp4, tmp7);
block.V1 = Avx.Add(z11, z4); Vector256<float> z12 = Avx.Subtract(tmp4, tmp7);
block.V7 = Avx.Subtract(z11, z4);
} tmp7 = Avx.Add(z11, z13);
tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);
/// <summary>
/// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/> z5 = Avx.Multiply(Avx.Add(z10, z12), mm256_F_1_8477);
/// using AVX commands.
/// </summary> tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, mm256_F_n1_0823);
/// <param name="s">Source</param> tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, mm256_F_n2_6131);
/// <param name="d">Destination</param>
public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) tmp6 = Avx.Subtract(tmp12, tmp7);
{ tmp5 = Avx.Subtract(tmp11, tmp6);
Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); tmp4 = Avx.Subtract(tmp10, tmp5);
Vector256<float> my1 = s.V1; block.V0 = Avx.Add(tmp0, tmp7);
Vector256<float> my7 = s.V7; block.V7 = Avx.Subtract(tmp0, tmp7);
Vector256<float> mz0 = Avx.Add(my1, my7); block.V1 = Avx.Add(tmp1, tmp6);
block.V6 = Avx.Subtract(tmp1, tmp6);
Vector256<float> my3 = s.V3; block.V2 = Avx.Add(tmp2, tmp5);
Vector256<float> mz2 = Avx.Add(my3, my7); block.V5 = Avx.Subtract(tmp2, tmp5);
Vector256<float> my5 = s.V5; block.V3 = Avx.Add(tmp3, tmp4);
Vector256<float> mz1 = Avx.Add(my3, my5); block.V4 = Avx.Subtract(tmp3, tmp4);
Vector256<float> mz3 = Avx.Add(my1, my5); }
Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758);
mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615);
mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901);
mz0 = Avx.Multiply(mz0, mm256_F_n0_8999);
mz1 = Avx.Multiply(mz1, mm256_F_n2_5629);
Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2);
Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3);
Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2);
Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3);
Vector256<float> my2 = s.V2;
Vector256<float> my6 = s.V6;
mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411);
Vector256<float> my0 = s.V0;
Vector256<float> my4 = s.V4;
mz0 = Avx.Add(my0, my4);
mz1 = Avx.Subtract(my0, my4);
mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477);
mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653);
my0 = Avx.Add(mz0, mz3);
my3 = Avx.Subtract(mz0, mz3);
my1 = Avx.Add(mz1, mz2);
my2 = Avx.Subtract(mz1, mz2);
d.V0 = Avx.Add(my0, mb0);
d.V7 = Avx.Subtract(my0, mb0);
d.V1 = Avx.Add(my1, mb1);
d.V6 = Avx.Subtract(my1, mb1);
d.V2 = Avx.Add(my2, mb2);
d.V5 = Avx.Subtract(my2, mb2);
d.V3 = Avx.Add(my3, mb3);
d.V4 = Avx.Subtract(my3, mb3);
} }
} }
} }

532
src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs

@ -3,6 +3,7 @@
using System.Numerics; using System.Numerics;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS #if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics.X86;
#endif #endif
@ -15,102 +16,202 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
/// </summary> /// </summary>
internal static partial class FastFloatingPointDCT internal static partial class FastFloatingPointDCT
{ {
#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore #pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
private const float C_1_175876 = 1.175875602f; private static readonly Vector4 mm128_F_0_7071 = new(0.707106781f);
private const float C_1_961571 = -1.961570560f; private static readonly Vector4 mm128_F_0_3826 = new(0.382683433f);
private const float C_0_390181 = -0.390180644f; private static readonly Vector4 mm128_F_0_5411 = new(0.541196100f);
private const float C_0_899976 = -0.899976223f; private static readonly Vector4 mm128_F_1_3065 = new(1.306562965f);
private const float C_2_562915 = -2.562915447f;
private const float C_0_298631 = 0.298631336f; private static readonly Vector4 mm128_F_1_4142 = new(1.414213562f);
private const float C_2_053120 = 2.053119869f; private static readonly Vector4 mm128_F_1_8477 = new(1.847759065f);
private const float C_3_072711 = 3.072711026f; private static readonly Vector4 mm128_F_n1_0823 = new(-1.082392200f);
private const float C_1_501321 = 1.501321110f; private static readonly Vector4 mm128_F_n2_6131 = new(-2.613125930f);
private const float C_0_541196 = 0.541196100f; #pragma warning restore SA1310, SA1311, IDE1006
private const float C_1_847759 = -1.847759065f;
private const float C_0_765367 = 0.765366865f;
private const float C_0_125 = 0.1250f;
#pragma warning disable SA1311, IDE1006 // naming rules violation warnings
private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f);
private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f);
private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f);
private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f);
#pragma warning restore SA1311, IDE1006
#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
/// <summary> /// <summary>
/// Gets reciprocal coefficients for jpeg quantization tables calculation. /// Gets adjustment table for quantization tables.
/// </summary> /// </summary>
/// <remarks> /// <remarks>
/// <para> /// <para>
/// Current FDCT implementation expects its results to be multiplied by /// Current IDCT and FDCT implementations are based on Arai, Agui,
/// a reciprocal quantization table. To get 8x8 reciprocal block values in this /// and Nakajima's algorithm. Both DCT methods do not
/// table must be divided by quantization table values scaled with quality settings. /// produce finished DCT output, final step is fused into the
/// quantization step. Quantization and de-quantization coefficients
/// must be multiplied by these values.
/// </para> /// </para>
/// <para> /// <para>
/// These values were calculated with this formula: /// Given values were generated by formula:
/// <code>
/// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
/// </code>
/// Where:
/// <code> /// <code>
/// scalefactor[row] * scalefactor[col], where
/// scalefactor[0] = 1 /// scalefactor[0] = 1
/// </code>
/// <code>
/// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
/// </code> /// </code>
/// Values are also scaled by 8 so DCT code won't do extra division/multiplication.
/// </para> /// </para>
/// </remarks> /// </remarks>
internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[] private static readonly float[] AdjustmentCoefficients = new float[]
{ {
0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, 1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f,
0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, 1.3870399f, 1.9238797f, 1.812255f, 1.6309863f, 1.3870399f, 1.0897902f, 0.7506606f, 0.38268346f,
0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, 1.306563f, 1.812255f, 1.707107f, 1.5363555f, 1.306563f, 1.02656f, 0.7071068f, 0.36047992f,
0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, 1.1758755f, 1.6309863f, 1.5363555f, 1.3826833f, 1.1758755f, 0.9238795f, 0.63637924f, 0.32442334f,
0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, 1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f,
0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, 0.78569496f, 1.0897902f, 1.02656f, 0.9238795f, 0.78569496f, 0.61731654f, 0.42521507f, 0.21677275f,
0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, 0.5411961f, 0.7506606f, 0.7071068f, 0.63637924f, 0.5411961f, 0.42521507f, 0.29289323f, 0.14931567f,
0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, 0.27589938f, 0.38268346f, 0.36047992f, 0.32442334f, 0.27589938f, 0.21677275f, 0.14931567f, 0.076120466f,
}; };
/// <summary> /// <summary>
/// Adjusts given quantization table to be compliant with FDCT implementation. /// Adjusts given quantization table for usage with <see cref="TransformIDCT"/>.
/// </summary>
/// <param name="quantTable">Quantization table to adjust.</param>
public static void AdjustToIDCT(ref Block8x8F quantTable)
{
    // Treat both the table and the adjustment coefficients as flat float runs.
    ref float first = ref Unsafe.As<Block8x8F, float>(ref quantTable);
    ref float coefficients = ref MemoryMarshal.GetReference<float>(AdjustmentCoefficients);

    // Scale every entry by its adjustment coefficient and by 1/8.
    for (nint offset = 0; offset < Block8x8F.Size; offset++)
    {
        ref float entry = ref Unsafe.Add(ref first, offset);
        entry = 0.125f * entry * Unsafe.Add(ref coefficients, offset);
    }

    // Spectral macroblocks are transposed before quantization
    // so we must transpose quantization table
    quantTable.TransposeInplace();
}
/// <summary>
/// Prepares the given quantization table for use with <see cref="TransformFDCT"/>.
/// </summary>
/// <remarks>
/// Every entry <c>q</c> is replaced in place by
/// <c>0.125 / (q * adjustment)</c>, where <c>adjustment</c> is the matching
/// value of the adjustment coefficient table.
/// </remarks>
/// <param name="quantTable">Quantization table to adjust.</param>
public static void AdjustToFDCT(ref Block8x8F quantTable)
{
    // Treat both the table and the adjustment coefficients as flat float runs.
    ref float first = ref Unsafe.As<Block8x8F, float>(ref quantTable);
    ref float coefficients = ref MemoryMarshal.GetReference<float>(AdjustmentCoefficients);

    for (nint offset = 0; offset < Block8x8F.Size; offset++)
    {
        ref float entry = ref Unsafe.Add(ref first, offset);
        entry = 0.125f / (entry * Unsafe.Add(ref coefficients, offset));
    }
}
/// <summary>
/// Apply 2D floating point IDCT inplace.
/// </summary> /// </summary>
/// <remarks> /// <remarks>
/// See <see cref="DctReciprocalAdjustmentCoefficients"/> docs for explanation. /// Input block must be dequantized before this method with table
/// adjusted by <see cref="AdjustToIDCT"/>.
/// </remarks> /// </remarks>
/// <param name="quantizationtable">Quantization table to adjust.</param> /// <param name="block">Input block.</param>
public static void AdjustToFDCT(ref Block8x8F quantizationtable) public static void TransformIDCT(ref Block8x8F block)
{ {
for (int i = 0; i < Block8x8F.Size; i++) #if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{ {
quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i]; IDCT8x8_Avx(ref block);
}
else
#endif
{
IDCT_Vector4(ref block);
} }
} }
/// <summary> /// <summary>
/// Apply 2D floating point FDCT inplace. /// Apply 2D floating point FDCT inplace.
/// </summary> /// </summary>
/// <param name="block">Input matrix.</param> /// <remarks>
/// Input block must be quantized after this method with table adjusted
/// by <see cref="AdjustToFDCT"/>.
/// </remarks>
/// <param name="block">Input block.</param>
public static void TransformFDCT(ref Block8x8F block) public static void TransformFDCT(ref Block8x8F block)
{ {
#if SUPPORTS_RUNTIME_INTRINSICS #if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported) if (Avx.IsSupported)
{ {
ForwardTransform_Avx(ref block); FDCT8x8_Avx(ref block);
} }
else else
#endif #endif
if (Vector.IsHardwareAccelerated) if (Vector.IsHardwareAccelerated)
{ {
ForwardTransform_Vector4(ref block); FDCT_Vector4(ref block);
} }
else else
{ {
ForwardTransform_Scalar(ref block); FDCT_Scalar(ref block);
}
}
/// <summary>
/// Apply floating point IDCT inplace using <see cref="Vector4"/> API.
/// </summary>
/// <remarks>
/// Runs a 1D pass over both 8x4 halves, transposes the block, then runs the
/// same 1D pass over both halves again.
/// NOTE(review): TransformIDCT, as visible in this change, falls back to this
/// method whenever AVX is unavailable without checking
/// Vector.IsHardwareAccelerated, so the debug guard below can fire on
/// non-accelerated hardware - confirm whether a scalar IDCT path is intended.
/// </remarks>
/// <param name="transposedBlock">Input block; the name indicates it is expected in transposed layout - TODO confirm against callers.</param>
private static void IDCT_Vector4(ref Block8x8F transposedBlock)
{
DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
// First pass - process columns
// (one call per 8x4 half, addressed through the V0L and V0R fields)
IDCT8x4_Vector4(ref transposedBlock.V0L);
IDCT8x4_Vector4(ref transposedBlock.V0R);
// Second pass - process rows
transposedBlock.TransposeInplace();
IDCT8x4_Vector4(ref transposedBlock.V0L);
IDCT8x4_Vector4(ref transposedBlock.V0R);
// Applies 1D floating point IDCT inplace on 8x4 part of 8x8 block
// vecRef addresses the first Vector4 of the half being processed. Vector k
// of that half is read and written at offset k * 2; the stride of two
// presumably steps over the other half's Vector4 in each 8-float row.
static void IDCT8x4_Vector4(ref Vector4 vecRef)
{
// Even part
// Loads vectors 0, 2, 4 and 6.
Vector4 tmp0 = Unsafe.Add(ref vecRef, 0 * 2);
Vector4 tmp1 = Unsafe.Add(ref vecRef, 2 * 2);
Vector4 tmp2 = Unsafe.Add(ref vecRef, 4 * 2);
Vector4 tmp3 = Unsafe.Add(ref vecRef, 6 * 2);
Vector4 z5 = tmp0;
Vector4 tmp10 = z5 + tmp2;
Vector4 tmp11 = z5 - tmp2;
Vector4 tmp13 = tmp1 + tmp3;
Vector4 tmp12 = ((tmp1 - tmp3) * mm128_F_1_4142) - tmp13;
// tmp0..tmp3 now hold the even-part results that are combined with the
// odd part when the outputs are stored.
tmp0 = tmp10 + tmp13;
tmp3 = tmp10 - tmp13;
tmp1 = tmp11 + tmp12;
tmp2 = tmp11 - tmp12;
// Odd part
// Loads vectors 1, 3, 5 and 7.
Vector4 tmp4 = Unsafe.Add(ref vecRef, 1 * 2);
Vector4 tmp5 = Unsafe.Add(ref vecRef, 3 * 2);
Vector4 tmp6 = Unsafe.Add(ref vecRef, 5 * 2);
Vector4 tmp7 = Unsafe.Add(ref vecRef, 7 * 2);
Vector4 z13 = tmp6 + tmp5;
Vector4 z10 = tmp6 - tmp5;
Vector4 z11 = tmp4 + tmp7;
Vector4 z12 = tmp4 - tmp7;
// tmp7, tmp10, tmp11 and tmp12 are reused as scratch from here on.
tmp7 = z11 + z13;
tmp11 = (z11 - z13) * mm128_F_1_4142;
z5 = (z10 + z12) * mm128_F_1_8477;
tmp10 = (z12 * mm128_F_n1_0823) + z5;
tmp12 = (z10 * mm128_F_n2_6131) + z5;
// Each value below depends on the one computed just before it, so the
// order of these three statements must not change.
tmp6 = tmp12 - tmp7;
tmp5 = tmp11 - tmp6;
tmp4 = tmp10 - tmp5;
// Final butterflies: output k is even + odd and output 7 - k is even - odd.
Unsafe.Add(ref vecRef, 0 * 2) = tmp0 + tmp7;
Unsafe.Add(ref vecRef, 7 * 2) = tmp0 - tmp7;
Unsafe.Add(ref vecRef, 1 * 2) = tmp1 + tmp6;
Unsafe.Add(ref vecRef, 6 * 2) = tmp1 - tmp6;
Unsafe.Add(ref vecRef, 2 * 2) = tmp2 + tmp5;
Unsafe.Add(ref vecRef, 5 * 2) = tmp2 - tmp5;
Unsafe.Add(ref vecRef, 3 * 2) = tmp3 + tmp4;
Unsafe.Add(ref vecRef, 4 * 2) = tmp3 - tmp4;
} }
} }
@ -120,8 +221,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
/// <remarks> /// <remarks>
/// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c. /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
/// </remarks> /// </remarks>
/// <param name="block">Input matrix.</param> /// <param name="block">Input block.</param>
private static void ForwardTransform_Scalar(ref Block8x8F block) private static void FDCT_Scalar(ref Block8x8F block)
{ {
const int dctSize = 8; const int dctSize = 8;
@ -130,17 +231,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
float z1, z2, z3, z4, z5, z11, z13; float z1, z2, z3, z4, z5, z11, z13;
// First pass - process rows // First pass - process rows
ref float dataRef = ref Unsafe.As<Block8x8F, float>(ref block); ref float blockRef = ref Unsafe.As<Block8x8F, float>(ref block);
for (int ctr = 7; ctr >= 0; ctr--) for (int ctr = 7; ctr >= 0; ctr--)
{ {
tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7); tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 7);
tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7); tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 7);
tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6); tmp1 = Unsafe.Add(ref blockRef, 1) + Unsafe.Add(ref blockRef, 6);
tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6); tmp6 = Unsafe.Add(ref blockRef, 1) - Unsafe.Add(ref blockRef, 6);
tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5); tmp2 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 5);
tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5); tmp5 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 5);
tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4); tmp3 = Unsafe.Add(ref blockRef, 3) + Unsafe.Add(ref blockRef, 4);
tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4); tmp4 = Unsafe.Add(ref blockRef, 3) - Unsafe.Add(ref blockRef, 4);
// Even part // Even part
tmp10 = tmp0 + tmp3; tmp10 = tmp0 + tmp3;
@ -148,12 +249,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
tmp11 = tmp1 + tmp2; tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2; tmp12 = tmp1 - tmp2;
Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11; Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11; Unsafe.Add(ref blockRef, 4) = tmp10 - tmp11;
z1 = (tmp12 + tmp13) * 0.707106781f; z1 = (tmp12 + tmp13) * 0.707106781f;
Unsafe.Add(ref dataRef, 2) = tmp13 + z1; Unsafe.Add(ref blockRef, 2) = tmp13 + z1;
Unsafe.Add(ref dataRef, 6) = tmp13 - z1; Unsafe.Add(ref blockRef, 6) = tmp13 - z1;
// Odd part // Odd part
tmp10 = tmp4 + tmp5; tmp10 = tmp4 + tmp5;
@ -168,26 +269,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
z11 = tmp7 + z3; z11 = tmp7 + z3;
z13 = tmp7 - z3; z13 = tmp7 - z3;
Unsafe.Add(ref dataRef, 5) = z13 + z2; Unsafe.Add(ref blockRef, 5) = z13 + z2;
Unsafe.Add(ref dataRef, 3) = z13 - z2; Unsafe.Add(ref blockRef, 3) = z13 - z2;
Unsafe.Add(ref dataRef, 1) = z11 + z4; Unsafe.Add(ref blockRef, 1) = z11 + z4;
Unsafe.Add(ref dataRef, 7) = z11 - z4; Unsafe.Add(ref blockRef, 7) = z11 - z4;
dataRef = ref Unsafe.Add(ref dataRef, dctSize); blockRef = ref Unsafe.Add(ref blockRef, dctSize);
} }
// Second pass - process columns // Second pass - process columns
dataRef = ref Unsafe.As<Block8x8F, float>(ref block); blockRef = ref Unsafe.As<Block8x8F, float>(ref block);
for (int ctr = 7; ctr >= 0; ctr--) for (int ctr = 7; ctr >= 0; ctr--)
{ {
tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7); tmp0 = Unsafe.Add(ref blockRef, dctSize * 0) + Unsafe.Add(ref blockRef, dctSize * 7);
tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7); tmp7 = Unsafe.Add(ref blockRef, dctSize * 0) - Unsafe.Add(ref blockRef, dctSize * 7);
tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6); tmp1 = Unsafe.Add(ref blockRef, dctSize * 1) + Unsafe.Add(ref blockRef, dctSize * 6);
tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6); tmp6 = Unsafe.Add(ref blockRef, dctSize * 1) - Unsafe.Add(ref blockRef, dctSize * 6);
tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5); tmp2 = Unsafe.Add(ref blockRef, dctSize * 2) + Unsafe.Add(ref blockRef, dctSize * 5);
tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5); tmp5 = Unsafe.Add(ref blockRef, dctSize * 2) - Unsafe.Add(ref blockRef, dctSize * 5);
tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4); tmp3 = Unsafe.Add(ref blockRef, dctSize * 3) + Unsafe.Add(ref blockRef, dctSize * 4);
tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4); tmp4 = Unsafe.Add(ref blockRef, dctSize * 3) - Unsafe.Add(ref blockRef, dctSize * 4);
// Even part // Even part
tmp10 = tmp0 + tmp3; tmp10 = tmp0 + tmp3;
@ -195,12 +296,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
tmp11 = tmp1 + tmp2; tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2; tmp12 = tmp1 - tmp2;
Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11; Unsafe.Add(ref blockRef, dctSize * 0) = tmp10 + tmp11;
Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11; Unsafe.Add(ref blockRef, dctSize * 4) = tmp10 - tmp11;
z1 = (tmp12 + tmp13) * 0.707106781f; z1 = (tmp12 + tmp13) * 0.707106781f;
Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1; Unsafe.Add(ref blockRef, dctSize * 2) = tmp13 + z1;
Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1; Unsafe.Add(ref blockRef, dctSize * 6) = tmp13 - z1;
// Odd part // Odd part
tmp10 = tmp4 + tmp5; tmp10 = tmp4 + tmp5;
@ -215,12 +316,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
z11 = tmp7 + z3; z11 = tmp7 + z3;
z13 = tmp7 - z3; z13 = tmp7 - z3;
Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2; Unsafe.Add(ref blockRef, dctSize * 5) = z13 + z2;
Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2; Unsafe.Add(ref blockRef, dctSize * 3) = z13 - z2;
Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4; Unsafe.Add(ref blockRef, dctSize * 1) = z11 + z4;
Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4; Unsafe.Add(ref blockRef, dctSize * 7) = z11 - z4;
dataRef = ref Unsafe.Add(ref dataRef, 1); blockRef = ref Unsafe.Add(ref blockRef, 1);
} }
} }
@ -230,11 +331,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
/// <remarks> /// <remarks>
/// This implementation must be called only if hardware supports 4 /// This implementation must be called only if hardware supports 4
/// floating point numbers vector. Otherwise explicit scalar /// floating point numbers vector. Otherwise explicit scalar
/// implementation <see cref="ForwardTransform_Scalar"/> is faster /// implementation <see cref="FDCT_Scalar"/> is faster
/// because it does not rely on matrix transposition. /// because it does not rely on block transposition.
/// </remarks> /// </remarks>
/// <param name="block">Input matrix.</param> /// <param name="block">Input block.</param>
private static void ForwardTransform_Vector4(ref Block8x8F block) public static void FDCT_Vector4(ref Block8x8F block)
{ {
DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
@ -247,209 +348,50 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
block.TransposeInplace(); block.TransposeInplace();
FDCT8x4_Vector4(ref block.V0L); FDCT8x4_Vector4(ref block.V0L);
FDCT8x4_Vector4(ref block.V0R); FDCT8x4_Vector4(ref block.V0R);
}
/// <summary> // Applies 1D floating point FDCT inplace on 8x4 part of 8x8 block
/// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix. static void FDCT8x4_Vector4(ref Vector4 vecRef)
/// </summary> {
/// <remarks> Vector4 tmp0 = Unsafe.Add(ref vecRef, 0) + Unsafe.Add(ref vecRef, 14);
/// Implemented using Vector4 API operations for either scalar or sse hardware implementation. Vector4 tmp7 = Unsafe.Add(ref vecRef, 0) - Unsafe.Add(ref vecRef, 14);
/// Must be called on both 8x4 matrix parts for the full FDCT transform. Vector4 tmp1 = Unsafe.Add(ref vecRef, 2) + Unsafe.Add(ref vecRef, 12);
/// </remarks> Vector4 tmp6 = Unsafe.Add(ref vecRef, 2) - Unsafe.Add(ref vecRef, 12);
/// <param name="blockRef">Input reference to the first </param> Vector4 tmp2 = Unsafe.Add(ref vecRef, 4) + Unsafe.Add(ref vecRef, 10);
private static void FDCT8x4_Vector4(ref Vector4 blockRef) Vector4 tmp5 = Unsafe.Add(ref vecRef, 4) - Unsafe.Add(ref vecRef, 10);
{ Vector4 tmp3 = Unsafe.Add(ref vecRef, 6) + Unsafe.Add(ref vecRef, 8);
Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14); Vector4 tmp4 = Unsafe.Add(ref vecRef, 6) - Unsafe.Add(ref vecRef, 8);
Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14);
Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12);
Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12);
Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10);
Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10);
Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8);
Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8);
// Even part
Vector4 tmp10 = tmp0 + tmp3;
Vector4 tmp13 = tmp0 - tmp3;
Vector4 tmp11 = tmp1 + tmp2;
Vector4 tmp12 = tmp1 - tmp2;
Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11;
Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
Unsafe.Add(ref blockRef, 4) = tmp13 + z1;
Unsafe.Add(ref blockRef, 12) = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5;
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
Vector4 z3 = tmp11 * mm128_F_0_7071;
Vector4 z11 = tmp7 + z3;
Vector4 z13 = tmp7 - z3;
Unsafe.Add(ref blockRef, 10) = z13 + z2;
Unsafe.Add(ref blockRef, 6) = z13 - z2;
Unsafe.Add(ref blockRef, 2) = z11 + z4;
Unsafe.Add(ref blockRef, 14) = z11 - z4;
}
/// <summary> // Even part
/// Apply floating point IDCT inplace. Vector4 tmp10 = tmp0 + tmp3;
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. Vector4 tmp13 = tmp0 - tmp3;
/// </summary> Vector4 tmp11 = tmp1 + tmp2;
/// <param name="block">Input matrix.</param> Vector4 tmp12 = tmp1 - tmp2;
/// <param name="temp">Matrix to store temporal results.</param>
public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
{
block.TransposeInplace();
IDCT8x8(ref block, ref temp);
temp.TransposeInplace();
IDCT8x8(ref temp, ref block);
// TODO: This can be fused into quantization table step Unsafe.Add(ref vecRef, 0) = tmp10 + tmp11;
block.MultiplyInPlace(C_0_125); Unsafe.Add(ref vecRef, 8) = tmp10 - tmp11;
}
/// <summary> Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
/// Performs 8x8 matrix Inverse Discrete Cosine Transform Unsafe.Add(ref vecRef, 4) = tmp13 + z1;
/// </summary> Unsafe.Add(ref vecRef, 12) = tmp13 - z1;
/// <param name="s">Source</param>
/// <param name="d">Destination</param>
private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
IDCT8x8_Avx(ref s, ref d);
}
else
#endif
{
IDCT8x4_LeftPart(ref s, ref d);
IDCT8x4_RightPart(ref s, ref d);
}
}
/// <summary> // Odd part
/// Do IDCT internal operations on the left part of the block. Original src: tmp10 = tmp4 + tmp5;
/// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 tmp11 = tmp5 + tmp6;
/// </summary> tmp12 = tmp6 + tmp7;
/// <param name="s">The source block</param>
/// <param name="d">Destination block</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
{
Vector4 my1 = s.V1L;
Vector4 my7 = s.V7L;
Vector4 mz0 = my1 + my7;
Vector4 my3 = s.V3L;
Vector4 mz2 = my3 + my7;
Vector4 my5 = s.V5L;
Vector4 mz1 = my3 + my5;
Vector4 mz3 = my1 + my5;
Vector4 mz4 = (mz0 + mz1) * C_1_175876;
mz2 = (mz2 * C_1_961571) + mz4;
mz3 = (mz3 * C_0_390181) + mz4;
mz0 = mz0 * C_0_899976;
mz1 = mz1 * C_2_562915;
Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
Vector4 my2 = s.V2L;
Vector4 my6 = s.V6L;
mz4 = (my2 + my6) * C_0_541196;
Vector4 my0 = s.V0L;
Vector4 my4 = s.V4L;
mz0 = my0 + my4;
mz1 = my0 - my4;
mz2 = mz4 + (my6 * C_1_847759);
mz3 = mz4 + (my2 * C_0_765367);
my0 = mz0 + mz3;
my3 = mz0 - mz3;
my1 = mz1 + mz2;
my2 = mz1 - mz2;
d.V0L = my0 + mb0;
d.V7L = my0 - mb0;
d.V1L = my1 + mb1;
d.V6L = my1 - mb1;
d.V2L = my2 + mb2;
d.V5L = my2 - mb2;
d.V3L = my3 + mb3;
d.V4L = my3 - mb3;
}
/// <summary> Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
/// Do IDCT internal operations on the right part of the block. Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
/// Original src: Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
/// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 Vector4 z3 = tmp11 * mm128_F_0_7071;
/// </summary>
/// <param name="s">The source block</param> Vector4 z11 = tmp7 + z3;
/// <param name="d">The destination block</param> Vector4 z13 = tmp7 - z3;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) Unsafe.Add(ref vecRef, 10) = z13 + z2;
{ Unsafe.Add(ref vecRef, 6) = z13 - z2;
Vector4 my1 = s.V1R; Unsafe.Add(ref vecRef, 2) = z11 + z4;
Vector4 my7 = s.V7R; Unsafe.Add(ref vecRef, 14) = z11 - z4;
Vector4 mz0 = my1 + my7; }
Vector4 my3 = s.V3R;
Vector4 mz2 = my3 + my7;
Vector4 my5 = s.V5R;
Vector4 mz1 = my3 + my5;
Vector4 mz3 = my1 + my5;
Vector4 mz4 = (mz0 + mz1) * C_1_175876;
mz2 = (mz2 * C_1_961571) + mz4;
mz3 = (mz3 * C_0_390181) + mz4;
mz0 = mz0 * C_0_899976;
mz1 = mz1 * C_2_562915;
Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
Vector4 my2 = s.V2R;
Vector4 my6 = s.V6R;
mz4 = (my2 + my6) * C_0_541196;
Vector4 my0 = s.V0R;
Vector4 my4 = s.V4R;
mz0 = my0 + my4;
mz1 = my0 - my4;
mz2 = mz4 + (my6 * C_1_847759);
mz3 = mz4 + (my2 * C_0_765367);
my0 = mz0 + mz3;
my3 = mz0 - mz3;
my1 = mz1 + mz2;
my2 = mz1 - mz2;
d.V0R = my0 + mb0;
d.V7R = my0 - mb0;
d.V1R = my1 + mb1;
d.V6R = my1 - mb1;
d.V2R = my2 + mb2;
d.V5R = my2 - mb2;
d.V3R = my3 + mb3;
d.V4R = my3 - mb3;
} }
} }
} }

29
src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs

@ -35,5 +35,34 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63 63, 63, 63, 63, 63, 63, 63, 63
}; };
/// <summary>
/// Gets span of zig-zag ordering indices with the transpose step fused in:
/// entry <c>k</c> is the index at which the k-th coefficient in scan order
/// is stored, with rows and columns of the 8x8 block swapped compared to
/// the natural zig-zag layout (second entry is 8 instead of 1, and so on).
/// </summary>
/// <remarks>
/// The table holds 64 real entries followed by 16 padding entries.
/// When reading corrupted data, the Huffman decoders could attempt
/// to reference an entry beyond the end of this array (if the decoded
/// zero run length reaches past the end of the block). To prevent
/// wild stores without adding an inner-loop test, we put some extra
/// "63"s after the real entries. This will cause the extra coefficient
/// to be stored in location 63 of the block, not somewhere random.
/// The worst case would be a run-length of 15, which means we need 16
/// fake entries.
/// </remarks>
public static ReadOnlySpan<byte> TransposingOrder => new byte[]
{
// 64 real entries: transposed zig-zag scan, 8 indices per line
0, 8, 1, 2, 9, 16, 24, 17,
10, 3, 4, 11, 18, 25, 32, 40,
33, 26, 19, 12, 5, 6, 13, 20,
27, 34, 41, 48, 56, 49, 42, 35,
28, 21, 14, 7, 15, 22, 29, 36,
43, 50, 57, 58, 51, 44, 37, 30,
23, 31, 38, 45, 52, 59, 60, 53,
46, 39, 47, 54, 61, 62, 55, 63,
// Extra entries for safety in decoder
// (16 padding entries, all redirecting to the last block element)
63, 63, 63, 63, 63, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63
};
} }
} }

3
src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs

@ -942,6 +942,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
break; break;
} }
} }
// Adjusting table for IDCT step during decompression
FastFloatingPointDCT.AdjustToIDCT(ref table);
} }
} }

11
tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs

@ -66,16 +66,17 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
/* /*
BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1288 (20H2/October2020Update) BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1348 (20H2/October2020Update)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.100-preview.3.21202.5 .NET SDK=6.0.100-preview.3.21202.5
[Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
DefaultJob : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT DefaultJob : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
| Method | Mean | Error | StdDev | | Method | Mean | Error | StdDev |
|------------------------------------ |----------:|----------:|----------:| |------------------------------------ |----------:|----------:|----------:|
| 'Baseline 4:4:4 Interleaved' | 11.781 ms | 0.0737 ms | 0.0654 ms | | 'Baseline 4:4:4 Interleaved' | 11.127 ms | 0.0659 ms | 0.0550 ms |
| 'Baseline 4:2:0 Interleaved' | 8.688 ms | 0.0345 ms | 0.0306 ms | | 'Baseline 4:2:0 Interleaved' | 8.458 ms | 0.0289 ms | 0.0256 ms |
| 'Baseline 4:0:0 (grayscale)' | 1.643 ms | 0.0092 ms | 0.0086 ms | | 'Baseline 4:0:0 (grayscale)' | 1.550 ms | 0.0050 ms | 0.0044 ms |
| 'Progressive 4:2:0 Non-Interleaved' | 13.770 ms | 0.0928 ms | 0.0823 ms | | 'Progressive 4:2:0 Non-Interleaved' | 13.220 ms | 0.0449 ms | 0.0398 ms |
*/ */

5
tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs

@ -183,9 +183,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
Assert.Equal(expected, actual); Assert.Equal(expected, actual);
} }
// This method has only 2 implementations:
// 1. AVX
// 2. Scalar
FeatureTestRunner.RunWithHwIntrinsicsFeature( FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest, RunTest,
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic); HwIntrinsics.AllowAll | HwIntrinsics.DisableHWIntrinsic);
} }
private static float[] Create8x8ColorCropTestData() private static float[] Create8x8ColorCropTestData()

26
tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs

@ -276,5 +276,31 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
seed, seed,
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
} }
/// <summary>
/// Verifies that <c>Block8x8.TransposeInplace</c> produces the same result
/// as the reference transpose applied to the same input data.
/// </summary>
[Fact]
public void TransposeInplace()
{
    static void RunTest()
    {
        // reference: transpose a plain array copy of the test data
        short[] reference = Create8x8ShortData();
        ReferenceImplementations.Transpose8x8(reference);

        // testee: load the same data into a block and transpose it in place
        Block8x8 block = default;
        block.LoadFrom(Create8x8ShortData());
        block.TransposeInplace();

        // Unload the block so both sides can be compared element by element
        short[] transposed = new short[64];
        block.CopyTo(transposed);

        Assert.Equal(reference, transposed);
    }

    // Only a single code path exists for this method:
    // 1. Scalar
    FeatureTestRunner.RunWithHwIntrinsicsFeature(
        RunTest,
        HwIntrinsics.DisableHWIntrinsic);
}
} }
} }

209
tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs

@ -2,9 +2,6 @@
// Licensed under the Apache License, Version 2.0. // Licensed under the Apache License, Version 2.0.
using System; using System;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics.X86;
#endif
using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Formats.Jpeg.Components;
using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils; using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
using SixLabors.ImageSharp.Tests.TestUtilities; using SixLabors.ImageSharp.Tests.TestUtilities;
@ -17,6 +14,20 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
[Trait("Format", "Jpg")] [Trait("Format", "Jpg")]
public static class DCTTests public static class DCTTests
{ {
private const int MaxAllowedValue = short.MaxValue;
private const int MinAllowedValue = short.MinValue;
/// <summary>
/// Creates a block in which every element holds the same value.
/// </summary>
/// <param name="value">Value written to each of the <see cref="Block8x8F.Size"/> elements.</param>
/// <returns>The filled block.</returns>
internal static Block8x8F CreateBlockFromScalar(float value)
{
    var block = default(Block8x8F);

    // Fill element by element through the block indexer
    int index = 0;
    while (index < Block8x8F.Size)
    {
        block[index] = value;
        index++;
    }

    return block;
}
public class FastFloatingPoint : JpegFixture public class FastFloatingPoint : JpegFixture
{ {
public FastFloatingPoint(ITestOutputHelper output) public FastFloatingPoint(ITestOutputHelper output)
@ -24,130 +35,75 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
{ {
} }
// Reference tests
[Theory] [Theory]
[InlineData(1)] [InlineData(1)]
[InlineData(2)] [InlineData(2)]
[InlineData(3)] [InlineData(3)]
public void LLM_TransformIDCT_CompareToNonOptimized(int seed) public void LLM_TransformIDCT_CompareToNonOptimized(int seed)
{ {
float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); float[] sourceArray = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
var srcBlock = Block8x8F.Load(sourceArray); var srcBlock = Block8x8F.Load(sourceArray);
// reference
Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock); Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock);
var temp = default(Block8x8F); // testee
FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp); // Part of the IDCT calculations is fused into the quantization step
// We must multiply input block with adjusted no-quantization matrix
this.CompareBlocks(expected, srcBlock, 1f); // before applying IDCT
} // Dequantization using unit matrix - no values are upscaled
Block8x8F dequantMatrix = CreateBlockFromScalar(1);
[Theory]
[InlineData(1)]
[InlineData(2)]
[InlineData(3)]
public void LLM_TransformIDCT_CompareToAccurate(int seed)
{
float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
var srcBlock = Block8x8F.Load(sourceArray); // This step is needed to apply adjusting multipliers to the input block
FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock); // IDCT implementation tranforms blocks after transposition
srcBlock.TransposeInplace();
srcBlock.MultiplyInPlace(ref dequantMatrix);
var temp = default(Block8x8F); // IDCT calculation
FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp); FastFloatingPointDCT.TransformIDCT(ref srcBlock);
this.CompareBlocks(expected, srcBlock, 1f); this.CompareBlocks(expected, srcBlock, 1f);
} }
// Inverse transform
[Theory]
[InlineData(1)]
[InlineData(2)]
public void IDCT8x4_LeftPart(int seed)
{
Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
var destBlock = default(Block8x8F);
var expectedDest = new float[64];
// reference
ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
// testee
FastFloatingPointDCT.IDCT8x4_LeftPart(ref srcBlock, ref destBlock);
var actualDest = new float[64];
destBlock.ScaledCopyTo(actualDest);
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
}
[Theory] [Theory]
[InlineData(1)] [InlineData(1)]
[InlineData(2)] [InlineData(2)]
public void IDCT8x4_RightPart(int seed) [InlineData(3)]
public void LLM_TransformIDCT_CompareToAccurate(int seed)
{ {
Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed); float[] sourceArray = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
var destBlock = default(Block8x8F); var srcBlock = Block8x8F.Load(sourceArray);
var expectedDest = new float[64];
// reference // reference
ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock);
// testee // testee
FastFloatingPointDCT.IDCT8x4_RightPart(ref srcBlock, ref destBlock); // Part of the IDCT calculations is fused into the quantization step
// We must multiply input block with adjusted no-quantization matrix
var actualDest = new float[64]; // before applying IDCT
destBlock.ScaledCopyTo(actualDest); // Dequantization using unit matrix - no values are upscaled
Block8x8F dequantMatrix = CreateBlockFromScalar(1);
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
}
[Theory]
[InlineData(1)]
[InlineData(2)]
public void IDCT8x8_Avx(int seed)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (!Avx.IsSupported)
{
this.Output.WriteLine("No AVX present, skipping test!");
return;
}
Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
Block8x8F srcBlock = default;
srcBlock.LoadFrom(src);
Block8x8F destBlock = default; // This step is needed to apply adjusting multipliers to the input block
FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
float[] expectedDest = new float[64]; // IDCT implementation tranforms blocks after transposition
srcBlock.TransposeInplace();
srcBlock.MultiplyInPlace(ref dequantMatrix);
// reference, left part // IDCT calculation
ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest); FastFloatingPointDCT.TransformIDCT(ref srcBlock);
// reference, right part this.CompareBlocks(expected, srcBlock, 1f);
ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
// testee, whole 8x8
FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock);
float[] actualDest = new float[64];
destBlock.ScaledCopyTo(actualDest);
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
#endif
} }
// Inverse transform
// This test covers entire IDCT conversion chain
// This test checks all hardware implementations
[Theory] [Theory]
[InlineData(1)] [InlineData(1)]
[InlineData(2)] [InlineData(2)]
@ -157,41 +113,53 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
{ {
int seed = FeatureTestRunner.Deserialize<int>(serialized); int seed = FeatureTestRunner.Deserialize<int>(serialized);
Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed); Span<float> src = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
var srcBlock = default(Block8x8F); var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src); srcBlock.LoadFrom(src);
var expectedDest = new float[64]; float[] expectedDest = new float[64];
var temp1 = new float[64]; float[] temp = new float[64];
var temp2 = default(Block8x8F);
// reference // reference
ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1); ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp);
// testee // testee
FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2); // Part of the IDCT calculations is fused into the quantization step
// We must multiply input block with adjusted no-quantization matrix
// before applying IDCT
Block8x8F dequantMatrix = CreateBlockFromScalar(1);
// Dequantization using unit matrix - no values are upscaled
// as quant matrix is all 1's
// This step is needed to apply adjusting multipliers to the input block
FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
srcBlock.MultiplyInPlace(ref dequantMatrix);
// IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace();
var actualDest = new float[64]; // IDCT calculation
srcBlock.ScaledCopyTo(actualDest); FastFloatingPointDCT.TransformIDCT(ref srcBlock);
float[] actualDest = srcBlock.ToArray();
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
} }
// 3 paths: // 4 paths:
// 1. AllowAll - call avx/fma implementation // 1. AllowAll - call avx/fma implementation
// 2. DisableFMA - call avx implementation without fma acceleration // 2. DisableFMA - call avx without fma implementation
// 3. DisableAvx - call fallback code of Vector4 implementation // 3. DisableAvx - call sse Vector4 implementation
// // 4. DisableHWIntrinsic - call scalar fallback implementation
// DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result
FeatureTestRunner.RunWithHwIntrinsicsFeature( FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest, RunTest,
seed, seed,
HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX); HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
} }
// Forward transform // Forward transform
// This test covers entire FDCT conversions chain // This test covers entire FDCT conversion chain
// This test checks all implementations: intrinsic and scalar fallback // This test checks all hardware implementations
[Theory] [Theory]
[InlineData(1)] [InlineData(1)]
[InlineData(2)] [InlineData(2)]
@ -201,7 +169,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
{ {
int seed = FeatureTestRunner.Deserialize<int>(serialized); int seed = FeatureTestRunner.Deserialize<int>(serialized);
Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed); Span<float> src = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
var block = default(Block8x8F); var block = default(Block8x8F);
block.LoadFrom(src); block.LoadFrom(src);
@ -212,23 +180,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true); ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
// testee // testee
// Part of the FDCT calculations is fused into the quantization step
// We must multiply transformed block with reciprocal values from FastFloatingPointDCT.ANN_DCT_reciprocalAdjustmen
FastFloatingPointDCT.TransformFDCT(ref block); FastFloatingPointDCT.TransformFDCT(ref block);
for (int i = 0; i < 64; i++)
{ // Part of the IDCT calculations is fused into the quantization step
block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i]; // We must multiply input block with adjusted no-quantization matrix
} // after applying FDCT
Block8x8F quantMatrix = CreateBlockFromScalar(1);
FastFloatingPointDCT.AdjustToFDCT(ref quantMatrix);
block.MultiplyInPlace(ref quantMatrix);
float[] actualDest = block.ToArray(); float[] actualDest = block.ToArray();
Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f)); Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
} }
// 3 paths: // 4 paths:
// 1. AllowAll - call avx/fma implementation // 1. AllowAll - call avx/fma implementation
// 2. DisableFMA - call avx implementation without fma acceleration // 2. DisableFMA - call avx without fma implementation
// 3. DisableAvx - call sse implementation // 3. DisableAvx - call sse Vector4 implementation
// 4. DisableHWIntrinsic - call scalar fallback implementation // 4. DisableHWIntrinsic - call scalar fallback implementation
FeatureTestRunner.RunWithHwIntrinsicsFeature( FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest, RunTest,

2
tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs

@ -172,7 +172,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
bool failed = false; bool failed = false;
for (int i = 0; i < 64; i++) for (int i = 0; i < Block8x8F.Size; i++)
{ {
float expected = a[i]; float expected = a[i];
float actual = b[i]; float actual = b[i];

15
tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs

@ -48,6 +48,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
public short MaxVal { get; private set; } = short.MinValue; public short MaxVal { get; private set; } = short.MinValue;
/// <summary>
/// Transposes the given block and forwards its values to the array-based
/// <c>MakeBlock</c> overload.
/// </summary>
/// <param name="block">Block to take the values from; received by value.</param>
/// <param name="y">Passed through unchanged to the array-based overload.</param>
/// <param name="x">Passed through unchanged to the array-based overload.</param>
internal void MakeBlock(Block8x8 block, int y, int x)
{
    // Only the copy received as the parameter is transposed here
    block.TransposeInplace();

    short[] transposed = block.ToArray();
    this.MakeBlock(transposed, y, x);
}
internal void MakeBlock(short[] data, int y, int x) internal void MakeBlock(short[] data, int y, int x)
{ {
this.MinVal = Math.Min(this.MinVal, data.Min()); this.MinVal = Math.Min(this.MinVal, data.Min());
@ -66,11 +72,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
Span<Block8x8> blockRow = data.GetRowSpan(y - startIndex); Span<Block8x8> blockRow = data.GetRowSpan(y - startIndex);
for (int x = 0; x < this.WidthInBlocks; x++) for (int x = 0; x < this.WidthInBlocks; x++)
{ {
short[] block = blockRow[x].ToArray(); this.MakeBlock(blockRow[x], y, x);
// x coordinate stays the same - we load entire stride
// y coordinate is tricky as we load single stride to full buffer - offset is needed
this.MakeBlock(block, y, x);
} }
} }
} }
@ -83,8 +85,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
Span<Block8x8> blockRow = data.GetRowSpan(y); Span<Block8x8> blockRow = data.GetRowSpan(y);
for (int x = 0; x < this.WidthInBlocks; x++) for (int x = 0; x < this.WidthInBlocks; x++)
{ {
short[] block = blockRow[x].ToArray(); this.MakeBlock(blockRow[x], y, x);
this.MakeBlock(block, y, x);
} }
} }
} }

17
tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs

@ -40,6 +40,23 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
} }
} }
/// <summary>
/// Transposes, in place, an 8x8 block stored row by row in a <see cref="Span{T}"/>.
/// </summary>
/// <param name="data">Block values; elements 0..63 are read and rewritten.</param>
internal static void Transpose8x8(Span<short> data)
{
    const int Stride = 8;

    // Visit every element below the main diagonal and exchange it with its
    // mirror above the diagonal; diagonal elements stay where they are.
    for (int row = 1; row < Stride; row++)
    {
        for (int col = 0; col < row; col++)
        {
            int below = (row * Stride) + col;
            int above = (col * Stride) + row;

            short held = data[below];
            data[below] = data[above];
            data[above] = held;
        }
    }
}
/// <summary> /// <summary>
/// Transpose 8x8 block stored linearly in a <see cref="Span{T}"/> /// Transpose 8x8 block stored linearly in a <see cref="Span{T}"/>
/// </summary> /// </summary>

14
tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs

@ -1,6 +1,7 @@
// Copyright (c) Six Labors. // Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0. // Licensed under the Apache License, Version 2.0.
using System;
using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Formats.Jpeg.Components;
using Xunit; using Xunit;
@ -9,8 +10,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
[Trait("Format", "Jpg")] [Trait("Format", "Jpg")]
public class ZigZagTests public class ZigZagTests
{ {
[Fact] private static void CanHandleAllPossibleCoefficients(ReadOnlySpan<byte> order)
public void ZigZagCanHandleAllPossibleCoefficients()
{ {
// Mimic the behaviour of the huffman scan decoder using all possible byte values // Mimic the behaviour of the huffman scan decoder using all possible byte values
short[] block = new short[64]; short[] block = new short[64];
@ -26,7 +26,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
if (s != 0) if (s != 0)
{ {
i += r; i += r;
block[ZigZag.ZigZagOrder[i++]] = (short)s; block[order[i++]] = (short)s;
} }
else else
{ {
@ -40,5 +40,13 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
} }
} }
} }
/// <summary>
/// Runs the shared coefficient check against <see cref="ZigZag.ZigZagOrder"/>.
/// </summary>
[Fact]
public static void ZigZagCanHandleAllPossibleCoefficients()
{
    CanHandleAllPossibleCoefficients(ZigZag.ZigZagOrder);
}
/// <summary>
/// Runs the shared coefficient check against <see cref="ZigZag.TransposingOrder"/>.
/// </summary>
[Fact]
public static void TrasposingZigZagCanHandleAllPossibleCoefficients()
{
    CanHandleAllPossibleCoefficients(ZigZag.TransposingOrder);
}
} }
} }

Loading…
Cancel
Save