Browse Source

Merge branch 'master' into bp/webpalpha

pull/1971/head
Brian Popow 4 years ago
committed by GitHub
parent
commit
484bd77d8d
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 8
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
  2. 5
      src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
  3. 133
      src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
  4. 429
      src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
  5. 2
      tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
  6. 6
      tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
  7. 5
      tests/ImageSharp.Tests/Memory/Allocators/UniformUnmanagedMemoryPoolTests.cs

8
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

@ -280,7 +280,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
}
/// <summary>
/// Quantize input block, apply zig-zag ordering and store result as 16bit integers.
/// Quantize input block, transpose, apply zig-zag ordering and store as <see cref="Block8x8"/>.
/// </summary>
/// <param name="block">Source block.</param>
/// <param name="dest">Destination block.</param>
@ -291,19 +291,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
if (Avx2.IsSupported)
{
MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
ZigZag.ApplyZigZagOrderingAvx2(ref dest);
ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest);
}
else if (Ssse3.IsSupported)
{
MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
ZigZag.ApplyZigZagOrderingSsse3(ref dest);
ZigZag.ApplyTransposingZigZagOrderingSsse3(ref dest);
}
else
#endif
{
for (int i = 0; i < Size; i++)
{
int idx = ZigZag.ZigZagOrder[i];
int idx = ZigZag.TransposingOrder[i];
float quantizedVal = block[idx] * qt[idx];
quantizedVal += quantizedVal < 0 ? -0.5f : 0.5f;
dest[i] = (short)quantizedVal;

5
src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs

@ -29,11 +29,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
// First pass - process rows
block.TransposeInplace();
// First pass - process columns
FDCT8x8_1D_Avx(ref block);
// Second pass - process columns
// Second pass - process rows
block.TransposeInplace();
FDCT8x8_1D_Avx(ref block);

133
src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs

@ -92,6 +92,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
tableRef = 0.125f / (tableRef * Unsafe.Add(ref multipliersRef, i));
tableRef = ref Unsafe.Add(ref tableRef, 1);
}
// Spectral macroblocks are not transposed before quantization.
// The transpose is done after quantization, at the zig-zag stage,
// so the quantization table itself must be transposed here.
quantTable.TransposeInplace();
}
/// <summary>
@ -133,14 +138,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
}
else
#endif
if (Vector.IsHardwareAccelerated)
{
FDCT_Vector4(ref block);
}
else
{
FDCT_Scalar(ref block);
}
}
/// <summary>
@ -217,136 +217,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
}
}
/// <summary>
/// Runs the 2D floating point forward DCT over an 8x8 block in place, using scalar arithmetic only.
/// </summary>
/// <remarks>
/// Port of the libjpeg-turbo floating point FDCT, see https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
/// The same 1D butterfly is executed in two passes: first over each run of 8 consecutive
/// floats (rows), then over each set of 8 floats spaced 8 elements apart (columns).
/// </remarks>
/// <param name="block">Block to transform; its contents are overwritten with the result.</param>
private static void FDCT_Scalar(ref Block8x8F block)
{
    const int dctSize = 8;

    // Rotation constants shared by both passes.
    const float sqrtHalf = 0.707106781f;        // ~ sqrt(1/2)
    const float cos3Pi8 = 0.382683433f;         // ~ cos(3*pi/8)
    const float cosDifference = 0.541196100f;   // ~ cos(pi/8) - cos(3*pi/8)
    const float cosSum = 1.306562965f;          // ~ cos(pi/8) + cos(3*pi/8)

    // Pass 1: transform every row. 'row' always points at the first element of the current row.
    ref float row = ref Unsafe.As<Block8x8F, float>(ref block);
    for (int y = 0; y < dctSize; y++)
    {
        // All eight inputs are read up front because the outputs below overwrite them.
        float in0 = Unsafe.Add(ref row, 0);
        float in1 = Unsafe.Add(ref row, 1);
        float in2 = Unsafe.Add(ref row, 2);
        float in3 = Unsafe.Add(ref row, 3);
        float in4 = Unsafe.Add(ref row, 4);
        float in5 = Unsafe.Add(ref row, 5);
        float in6 = Unsafe.Add(ref row, 6);
        float in7 = Unsafe.Add(ref row, 7);

        float sum07 = in0 + in7;
        float dif07 = in0 - in7;
        float sum16 = in1 + in6;
        float dif16 = in1 - in6;
        float sum25 = in2 + in5;
        float dif25 = in2 - in5;
        float sum34 = in3 + in4;
        float dif34 = in3 - in4;

        // Even outputs (0, 2, 4, 6) depend only on the sums.
        float evenOuterSum = sum07 + sum34;
        float evenOuterDif = sum07 - sum34;
        float evenInnerSum = sum16 + sum25;
        float evenInnerDif = sum16 - sum25;

        Unsafe.Add(ref row, 0) = evenOuterSum + evenInnerSum;
        Unsafe.Add(ref row, 4) = evenOuterSum - evenInnerSum;

        float evenRot = (evenInnerDif + evenOuterDif) * sqrtHalf;
        Unsafe.Add(ref row, 2) = evenOuterDif + evenRot;
        Unsafe.Add(ref row, 6) = evenOuterDif - evenRot;

        // Odd outputs (1, 3, 5, 7) depend only on the differences.
        float odd45 = dif34 + dif25;
        float odd56 = dif25 + dif16;
        float odd67 = dif16 + dif07;

        float shared = (odd45 - odd67) * cos3Pi8;
        float rotLow = (cosDifference * odd45) + shared;
        float rotHigh = (cosSum * odd67) + shared;
        float rotMid = odd56 * sqrtHalf;

        float upper = dif07 + rotMid;
        float lower = dif07 - rotMid;

        Unsafe.Add(ref row, 5) = lower + rotLow;
        Unsafe.Add(ref row, 3) = lower - rotLow;
        Unsafe.Add(ref row, 1) = upper + rotHigh;
        Unsafe.Add(ref row, 7) = upper - rotHigh;

        // Step to the next row.
        row = ref Unsafe.Add(ref row, dctSize);
    }

    // Pass 2: transform every column. 'col' always points at the top element of the current column,
    // so consecutive column entries are dctSize floats apart.
    ref float col = ref Unsafe.As<Block8x8F, float>(ref block);
    for (int x = 0; x < dctSize; x++)
    {
        float in0 = Unsafe.Add(ref col, dctSize * 0);
        float in1 = Unsafe.Add(ref col, dctSize * 1);
        float in2 = Unsafe.Add(ref col, dctSize * 2);
        float in3 = Unsafe.Add(ref col, dctSize * 3);
        float in4 = Unsafe.Add(ref col, dctSize * 4);
        float in5 = Unsafe.Add(ref col, dctSize * 5);
        float in6 = Unsafe.Add(ref col, dctSize * 6);
        float in7 = Unsafe.Add(ref col, dctSize * 7);

        float sum07 = in0 + in7;
        float dif07 = in0 - in7;
        float sum16 = in1 + in6;
        float dif16 = in1 - in6;
        float sum25 = in2 + in5;
        float dif25 = in2 - in5;
        float sum34 = in3 + in4;
        float dif34 = in3 - in4;

        // Even outputs.
        float evenOuterSum = sum07 + sum34;
        float evenOuterDif = sum07 - sum34;
        float evenInnerSum = sum16 + sum25;
        float evenInnerDif = sum16 - sum25;

        Unsafe.Add(ref col, dctSize * 0) = evenOuterSum + evenInnerSum;
        Unsafe.Add(ref col, dctSize * 4) = evenOuterSum - evenInnerSum;

        float evenRot = (evenInnerDif + evenOuterDif) * sqrtHalf;
        Unsafe.Add(ref col, dctSize * 2) = evenOuterDif + evenRot;
        Unsafe.Add(ref col, dctSize * 6) = evenOuterDif - evenRot;

        // Odd outputs.
        float odd45 = dif34 + dif25;
        float odd56 = dif25 + dif16;
        float odd67 = dif16 + dif07;

        float shared = (odd45 - odd67) * cos3Pi8;
        float rotLow = (cosDifference * odd45) + shared;
        float rotHigh = (cosSum * odd67) + shared;
        float rotMid = odd56 * sqrtHalf;

        float upper = dif07 + rotMid;
        float lower = dif07 - rotMid;

        Unsafe.Add(ref col, dctSize * 5) = lower + rotLow;
        Unsafe.Add(ref col, dctSize * 3) = lower - rotLow;
        Unsafe.Add(ref col, dctSize * 1) = upper + rotHigh;
        Unsafe.Add(ref col, dctSize * 7) = upper - rotHigh;

        // Step to the next column.
        col = ref Unsafe.Add(ref col, 1);
    }
}
/// <summary>
/// Apply floating point FDCT inplace using <see cref="Vector4"/> API.
/// </summary>
/// <remarks>
/// This implementation must be called only if hardware supports 4
/// floating point numbers vector. Otherwise explicit scalar
/// implementation <see cref="FDCT_Scalar"/> is faster
/// because it does not rely on block transposition.
/// </remarks>
/// <param name="block">Input block.</param>
public static void FDCT_Vector4(ref Block8x8F block)
{
DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
// First pass - process rows
block.TransposeInplace();
// First pass - process columns
FDCT8x4_Vector4(ref block.V0L);
FDCT8x4_Vector4(ref block.V0R);
// Second pass - process columns
// Second pass - process rows
block.TransposeInplace();
FDCT8x4_Vector4(ref block.V0L);
FDCT8x4_Vector4(ref block.V0R);

429
src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs

@ -3,6 +3,7 @@
#if SUPPORTS_RUNTIME_INTRINSICS
using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@ -18,120 +19,138 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
#pragma warning restore SA1309
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSsse3"/>
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingSsse3"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
{
// row0
0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
_, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
_, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
// row1
_, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
_, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
// row2
_, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
_, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
// row3
_, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
_, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
_, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
// row4
_, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
_, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
_, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
// row5
_, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
// row6
_, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
_, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
// row7
10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
_, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
#pragma warning disable SA1515
/* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */
// A
0, 1, _, _, 2, 3, 4, 5, _, _, _, _, _, _, _, _,
// B
_, _, 0, 1, _, _, _, _, 2, 3, _, _, _, _, _, _,
// C
_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3,
/* row1 - B2 A3 A4 B3 C2 D1 E0 F0 */
// A
_, _, 6, 7, 8, 9, _, _, _, _, _, _, _, _, _, _,
// B
4, 5, _, _, _, _, 6, 7, _, _, _, _, _, _, _, _,
/* row2 - E1 D2 C3 B4 A5 A6 B5 C4 */
// A
_, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, _, _,
// B
_, _, _, _, _, _, 8, 9, _, _, _, _, 10, 11, _, _,
// C
_, _, _, _, 6, 7, _, _, _, _, _, _, _, _, 8, 9,
/* row3 - D3 E2 F1 G0 H0 G1 F2 E3 */
// E
_, _, 4, 5, _, _, _, _, _, _, _, _, _, _, 6, 7,
// F
_, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, _, _,
// G
_, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _,
/* row4 - D4 C5 B6 A7 B7 C6 D5 E4 */
// B
_, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _,
// C
_, _, 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _,
// D
8, 9, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _,
/* row5 - F3 G2 H1 H2 G3 F4 E5 D6 */
// F
6, 7, _, _, _, _, _, _, _, _, 8, 9, _, _, _, _,
// G
_, _, 4, 5, _, _, _, _, 6, 7, _, _, _, _, _, _,
// H
_, _, _, _, 2, 3, 4, 5, _, _, _, _, _, _, _, _,
/* row6 - C7 D7 E6 F5 G4 H3 H4 G5 */
// G
_, _, _, _, _, _, _, _, 8, 9, _, _, _, _, 10, 11,
// H
_, _, _, _, _, _, _, _, _, _, 6, 7, 8, 9, _, _,
/* row7 - F6 E7 F7 G6 H5 H6 G7 H7 */
// F
12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
// G
_, _, _, _, _, _, 12, 13, _, _, _, _, 14, 15, _, _,
// H
_, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, 14, 15,
#pragma warning restore SA1515
};
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx2"/>
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingAvx2"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
{
// 01_AB/01_EF/23_CD - cross-lane
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0,
// 01_AB - inner-lane
0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7,
// 01_CD/23_GH - cross-lane
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _,
// 01_CD - inner-lane
_, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _,
// 01_EF - inner-lane
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
// 23_AB/45_CD/67_EF - cross-lane
3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,
// 23_AB - inner-lane
4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _,
// 23_CD - inner-lane
_, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13,
// 23_EF - inner-lane
_, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 23_GH - inner-lane
_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 45_AB - inner-lane
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _,
// 45_CD - inner-lane
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _,
// 45_EF - cross-lane
1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
// 45_EF - inner-lane
2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _,
// 45_GH - inner-lane
_, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
// 67_CD - inner-lane
_, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 67_EF - inner-lane
_, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _,
// 67_GH - inner-lane
8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15
#pragma warning disable SA1515
/* 01 */
// [cr] crln_01_AB_CD
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
// (in) AB
0, 1, 8, 9, 2, 3, 4, 5, 10, 11, _, _, _, _, _, _, 12, 13, 2, 3, 4, 5, 14, 15, _, _, _, _, _, _, _, _,
// (in) CD
_, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9, 2, 3, _, _, _, _, _, _, _, _, 0, 1, 10, 11, _, _, _, _,
// [cr] crln_01_23_EF_23_CD
0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
// (in) EF
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9,
/* 23 */
// [cr] crln_23_AB_23_45_GH
2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
// (in) AB
_, _, _, _, _, _, 8, 9, 2, 3, 4, 5, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// (in) CD
_, _, 12, 13, 6, 7, _, _, _, _, _, _, _, _, 8, 9, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// (in) EF
2, 3, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 4, 5, 10, 11, _, _, _, _, _, _, 12, 13, 6, 7,
// (in) GH
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9, 2, 3, _, _, _, _,
/* 45 */
// (in) AB
_, _, _, _, 12, 13, 6, 7, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// [cr] crln_45_67_CD_45_EF
2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
// (in) CD
8, 9, 2, 3, _, _, _, _, _, _, 4, 5, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 12, 13,
// (in) EF
_, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 6, 7, _, _, _, _, _, _, _, _, 8, 9, 2, 3, _, _,
// (in) GH
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 4, 5, 10, 11, 12, 13, 6, 7, _, _, _, _, _, _,
/* 67 */
// (in) CD
6, 7, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// [cr] crln_67_EF_67_GH
2, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,
// (in) EF
_, _, _, _, 4, 5, 14, 15, _, _, _, _, _, _, _, _, 8, 9, 2, 3, 10, 11, _, _, _, _, _, _, _, _, _, _,
// (in) GH
_, _, _, _, _, _, _, _, 0, 1, 10, 11, 12, 13, 2, 3, _, _, _, _, _, _, 0, 1, 6, 7, 8, 9, 2, 3, 10, 11,
#pragma warning restore SA1515
};
/// <summary>
/// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block)
{
DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
fixed (byte* maskPtr = SseShuffleMasks)
fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks))
{
Vector128<byte> rowA = block.V0.AsByte();
Vector128<byte> rowB = block.V1.AsByte();
@ -142,73 +161,69 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
Vector128<byte> rowG = block.V6.AsByte();
Vector128<byte> rowH = block.V7.AsByte();
// row0 - A0 A1 B0 C0 B1 A2 A3 B2
Vector128<short> rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16();
Vector128<short> rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16();
Vector128<short> row0 = Sse2.Or(rowA0, rowB0);
Vector128<short> rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16();
row0 = Sse2.Or(row0, rowC0);
// row1 - C1 D0 E0 D1 C2 B3 A4 A5
Vector128<short> rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16();
Vector128<short> rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16();
Vector128<short> row1 = Sse2.Or(rowA1, rowC1);
Vector128<short> rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16();
row1 = Sse2.Or(row1, rowD1);
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16();
// row2
Vector128<short> rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16();
Vector128<short> rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16();
Vector128<short> row2 = Sse2.Or(rowE2, rowF2);
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16();
// row3
Vector128<short> rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16().AsInt16();
Vector128<short> rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16().AsInt16();
Vector128<short> row3 = Sse2.Or(rowA3, rowB3);
Vector128<short> rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16();
row3 = Sse2.Or(row3, rowC3);
Vector128<byte> shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11));
Vector128<short> rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16();
row3 = Sse2.Or(row3, rowD3);
// row4
Vector128<short> rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16();
Vector128<short> rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16();
Vector128<short> row4 = Sse2.Or(rowE4, rowF4);
Vector128<short> rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16();
row4 = Sse2.Or(row4, rowG4);
Vector128<short> rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16();
row4 = Sse2.Or(row4, rowH4);
// row5
Vector128<short> rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16();
Vector128<short> rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
Vector128<short> row5 = Sse2.Or(rowC5, rowD5);
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16();
// row6
Vector128<short> rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16();
Vector128<short> rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16();
Vector128<short> row6 = Sse2.Or(rowE6, rowF6);
Vector128<short> rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16();
row6 = Sse2.Or(row6, rowH6);
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16();
// row7
Vector128<short> rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16();
Vector128<short> rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16();
Vector128<short> row7 = Sse2.Or(rowG7, rowH7);
row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16();
// row0 - A0 B0 A1 A2 B1 C0 D0 C1
Vector128<short> row0_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0))).AsInt16();
Vector128<short> row0_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))).AsInt16();
Vector128<short> row0_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))).AsInt16();
Vector128<short> row0 = Sse2.Or(Sse2.Or(row0_A, row0_B), row0_C);
row0 = Sse2.Insert(row0.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 0), 6).AsInt16();
// row1 - B2 A3 A4 B3 C2 D1 E0 F0
Vector128<short> row1_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3))).AsInt16();
Vector128<short> row1_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))).AsInt16();
Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 2), 4).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 1), 5).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 6).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 0), 7).AsInt16();
// row2 - E1 D2 C3 B4 A5 A6 B5 C4
Vector128<short> row2_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5))).AsInt16();
Vector128<short> row2_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))).AsInt16();
Vector128<short> row2_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))).AsInt16();
Vector128<short> row2 = Sse2.Or(Sse2.Or(row2_A, row2_B), row2_C);
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 1).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 1), 0).AsInt16();
// row3 - D3 E2 F1 G0 H0 G1 F2 E3
Vector128<short> row3_E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8))).AsInt16();
Vector128<short> row3_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))).AsInt16();
Vector128<short> row3_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))).AsInt16();
Vector128<short> row3 = Sse2.Or(Sse2.Or(row3_E, row3_F), row3_G);
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 3), 0).AsInt16();
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowH.AsUInt16(), 0), 4).AsInt16();
// row4 - D4 C5 B6 A7 B7 C6 D5 E4
Vector128<short> row4_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11))).AsInt16();
Vector128<short> row4_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))).AsInt16();
Vector128<short> row4_D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))).AsInt16();
Vector128<short> row4 = Sse2.Or(Sse2.Or(row4_B, row4_C), row4_D);
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowA.AsUInt16(), 7), 3).AsInt16();
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 4), 7).AsInt16();
// row5 - F3 G2 H1 H2 G3 F4 E5 D6
Vector128<short> row5_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14))).AsInt16();
Vector128<short> row5_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))).AsInt16();
Vector128<short> row5_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))).AsInt16();
Vector128<short> row5 = Sse2.Or(Sse2.Or(row5_F, row5_G), row5_H);
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 6), 7).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 6).AsInt16();
// row6 - C7 D7 E6 F5 G4 H3 H4 G5
Vector128<short> row6_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17))).AsInt16();
Vector128<short> row6_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))).AsInt16();
Vector128<short> row6 = Sse2.Or(row6_G, row6_H);
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 7), 0).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 1).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 6), 2).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 5), 3).AsInt16();
// row7 - F6 E7 F7 G6 H5 H6 G7 H7
Vector128<short> row7_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19))).AsInt16();
Vector128<short> row7_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))).AsInt16();
Vector128<short> row7_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))).AsInt16();
Vector128<short> row7 = Sse2.Or(Sse2.Or(row7_F, row7_G), row7_H);
row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 7), 1).AsInt16();
block.V0 = row0;
block.V1 = row1;
@ -225,69 +240,61 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
/// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block)
public static unsafe void ApplyTransposingZigZagOrderingAvx2(ref Block8x8 block)
{
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(AvxShuffleMasks))
{
Vector256<byte> rowsAB = block.V01.AsByte();
Vector256<byte> rowsCD = block.V23.AsByte();
Vector256<byte> rowsEF = block.V45.AsByte();
Vector256<byte> rowsGH = block.V67.AsByte();
// rows 0 1
Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256<byte> rowAB = block.V01.AsByte();
Vector256<byte> rowCD = block.V23.AsByte();
Vector256<byte> rowEF = block.V45.AsByte();
Vector256<byte> rowGH = block.V67.AsByte();
/* row01 - A0 B0 A1 A2 B1 C0 D0 C1 | B2 A3 A4 B3 C2 D1 E0 F0 */
Vector256<int> crln_01_AB_CD = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_01_AB_CD).AsByte();
row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
// rows 2 3
Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_AB_CD).AsByte();
row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (2 * 32))).AsByte();
Vector256<int> crln_01_23_EF_23_CD = Avx.LoadVector256(shuffleVectorsPtr + (3 * 32)).AsInt32();
Vector256<byte> row01_23_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_01_23_EF_23_CD).AsByte();
Vector256<byte> row01_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
Vector256<byte> row01 = Avx2.Or(row01_AB, Avx2.Or(row01_CD, row01_EF));
/* row23 - E1 D2 C3 B4 A5 A6 B5 C4 | D3 E2 F1 G0 H0 G1 F2 E3 */
Vector256<int> crln_23_AB_23_45_GH = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
Vector256<byte> row23_45_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_23_AB_23_45_GH).AsByte();
Vector256<byte> row23_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_23_EF_23_CD).AsByte();
row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
Vector256<byte> row23_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
Vector256<byte> row23_45_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_23_AB_23_45_GH).AsByte();
Vector256<byte> row23_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32))).AsByte();
Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
// rows 4 5
Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
/* row45 - D4 C5 B6 A7 B7 C6 D5 E4 | F3 G2 H1 H2 G3 F4 E5 D6 */
Vector256<byte> row45_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32))).AsByte();
Vector256<int> crln_45_67_CD_45_EF = Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsInt32();
Vector256<byte> row45_67_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_45_67_CD_45_EF).AsByte();
Vector256<byte> row45_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (12 * 32))).AsByte();
Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_45_67_CD_45_EF).AsByte();
row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32))).AsByte();
Vector256<byte> row45_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32))).AsByte();
Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
// rows 6 7
Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
/* row67 - C7 D7 E6 F5 G4 H3 H4 G5 | F6 E7 F7 G6 H5 H6 G7 H7 */
Vector256<byte> row67_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32))).AsByte();
Vector256<int> crln_67_EF_67_GH = Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsInt32();
Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_67_EF_67_GH).AsByte();
row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32))).AsByte();
Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_67_EF_67_GH).AsByte();
row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (18 * 32))).AsByte();
Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
Vector256<byte> row67 = Avx2.Or(row67_CD, Avx2.Or(row67_EF, row67_GH));
block.V01 = row01.AsInt16();
block.V23 = row23.AsInt16();

2
tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs

@ -220,7 +220,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
// Reference implementation quantizes given block via division
Block8x8 expected = default;
ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder);
ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.TransposingOrder);
// Actual current implementation quantizes given block via multiplication
// With quantization table reciprocal

6
tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs

@ -135,10 +135,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
srcBlock.MultiplyInPlace(ref dequantMatrix);
// testee
// IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace();
// IDCT calculation
FastFloatingPointDCT.TransformIDCT(ref srcBlock);
float[] actualDest = srcBlock.ToArray();
@ -180,7 +179,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
// testee
// Second transpose call is done by Quantize step
// Do this manually here just to be compliant with the reference implementation
FastFloatingPointDCT.TransformFDCT(ref block);
block.TransposeInplace();
// Part of the IDCT calculations is fused into the quantization step
// We must multiply input block with adjusted no-quantization matrix

5
tests/ImageSharp.Tests/Memory/Allocators/UniformUnmanagedMemoryPoolTests.cs

@ -245,7 +245,10 @@ namespace SixLabors.ImageSharp.Tests.Memory.Allocators
cleanup.Register(b1);
}
[Theory]
public static readonly bool IsNotMacOS = !TestEnvironment.IsOSX;
// TODO: Investigate MacOS failures
[ConditionalTheory(nameof(IsNotMacOS))]
[InlineData(false)]
[InlineData(true)]
public void RentReturnRelease_SubsequentRentReturnsDifferentHandles(bool multiple)

Loading…
Cancel
Save