Browse Source

New FDCT method, reciprocal quantization

pull/1761/head
Dmitry Pentin 5 years ago
parent
commit
2f143bf9d3
  1. 81
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
  2. 209
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
  3. 2
      src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
  4. 34
      src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
  5. 210
      src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
  6. 400
      src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
  7. 108
      src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
  8. 8
      tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
  9. 50
      tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
  10. 149
      tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs

81
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs

@ -3,6 +3,7 @@
#if SUPPORTS_RUNTIME_INTRINSICS
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
@ -38,7 +39,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
0, 1, 4, 5, 2, 3, 6, 7
};
private static unsafe void DivideIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
@ -53,8 +54,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
for (int i = 0; i < 8; i += 2)
{
Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16();
@ -64,7 +65,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
}
}
private static void DivideIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");
@ -75,13 +76,81 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
for (int i = 0; i < 16; i += 2)
{
Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
Vector128<short> row = Sse2.PackSignedSaturate(left, right);
Unsafe.Add(ref destBase, i / 2) = row;
}
}
/// <summary>
/// Transposes this block in place using AVX instructions.
/// </summary>
/// <remarks>
/// All eight source vectors (<c>r0</c>..<c>r7</c>) are loaded into locals before the first
/// store to <c>this.V0</c>..<c>this.V7</c>, so the in-place stores never feed back into the loads.
/// </remarks>
private void TransposeAvx()
{
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
// r0..r3: lower 128 bits of V0..V3 are kept, upper 128 bits are replaced with V4L..V7L.
Vector256<float> r0 = Avx.InsertVector128(
this.V0,
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
1);
Vector256<float> r1 = Avx.InsertVector128(
this.V1,
Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
1);
Vector256<float> r2 = Avx.InsertVector128(
this.V2,
Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
1);
Vector256<float> r3 = Avx.InsertVector128(
this.V3,
Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
1);
// r4..r7: V0R..V3R widened to 256 bits, with the upper 128 bits replaced by V4R..V7R.
Vector256<float> r4 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
1);
Vector256<float> r5 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
1);
Vector256<float> r6 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
1);
Vector256<float> r7 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
1);
// From here on only the locals are read; every unpack/shuffle/blend group below
// stores two output vectors back into this block.
// Low halves of r0..r3 -> V0, V1
Vector256<float> t0 = Avx.UnpackLow(r0, r1);
Vector256<float> t2 = Avx.UnpackLow(r2, r3);
Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
this.V0 = Avx.Blend(t0, v, 0xCC);
this.V1 = Avx.Blend(t2, v, 0x33);
// Low halves of r4..r7 -> V4, V5
Vector256<float> t4 = Avx.UnpackLow(r4, r5);
Vector256<float> t6 = Avx.UnpackLow(r6, r7);
v = Avx.Shuffle(t4, t6, 0x4E);
this.V4 = Avx.Blend(t4, v, 0xCC);
this.V5 = Avx.Blend(t6, v, 0x33);
// High halves of r0..r3 -> V2, V3
Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
v = Avx.Shuffle(t1, t3, 0x4E);
this.V2 = Avx.Blend(t1, v, 0xCC);
this.V3 = Avx.Blend(t3, v, 0x33);
// High halves of r4..r7 -> V6, V7
Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
v = Avx.Shuffle(t5, t7, 0x4E);
this.V6 = Avx.Blend(t5, v, 0xCC);
this.V7 = Avx.Blend(t7, v, 0x33);
}
}
}
#endif

209
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

@ -413,41 +413,41 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported)
{
DivideIntoInt16_Avx2(ref block, ref qt, ref dest);
MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
ZigZag.ApplyZigZagOrderingAvx(ref dest, ref dest);
}
else if (Ssse3.IsSupported)
{
DivideIntoInt16_Sse2(ref block, ref qt, ref dest);
MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
ZigZag.ApplyZigZagOrderingSse(ref dest, ref dest);
}
else
#endif
{
Divide(ref block, ref qt);
Multiply(ref block, ref qt);
block.RoundInto(ref dest);
}
}
[MethodImpl(InliningOptions.ShortMethod)]
private static void Divide(ref Block8x8F a, ref Block8x8F b)
{
a.V0L /= b.V0L;
a.V0R /= b.V0R;
a.V1L /= b.V1L;
a.V1R /= b.V1R;
a.V2L /= b.V2L;
a.V2R /= b.V2R;
a.V3L /= b.V3L;
a.V3R /= b.V3R;
a.V4L /= b.V4L;
a.V4R /= b.V4R;
a.V5L /= b.V5L;
a.V5R /= b.V5R;
a.V6L /= b.V6L;
a.V6R /= b.V6R;
a.V7L /= b.V7L;
a.V7R /= b.V7R;
private static void Multiply(ref Block8x8F a, ref Block8x8F b)
{
a.V0L *= b.V0L;
a.V0R *= b.V0R;
a.V1L *= b.V1L;
a.V1R *= b.V1R;
a.V2L *= b.V2L;
a.V2R *= b.V2R;
a.V3L *= b.V3L;
a.V3R *= b.V3R;
a.V4L *= b.V4L;
a.V4R *= b.V4R;
a.V5L *= b.V5L;
a.V5R *= b.V5R;
a.V6L *= b.V6L;
a.V6R *= b.V6R;
a.V7L *= b.V7L;
a.V7R *= b.V7R;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@ -608,154 +608,45 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
}
/// <summary>
/// Transpose the block into the destination block.
/// Transpose the block inplace.
/// </summary>
/// <param name="d">The destination block</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void TransposeInto(ref Block8x8F d)
public void Transpose()
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
Vector256<float> r0 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0L).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
1);
Vector256<float> r1 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V1L).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
1);
Vector256<float> r2 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V2L).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
1);
Vector256<float> r3 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V3L).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
1);
Vector256<float> r4 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
1);
Vector256<float> r5 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
1);
Vector256<float> r6 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
1);
Vector256<float> r7 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
1);
Vector256<float> t0 = Avx.UnpackLow(r0, r1);
Vector256<float> t2 = Avx.UnpackLow(r2, r3);
Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
d.V0 = Avx.Blend(t0, v, 0xCC);
d.V1 = Avx.Blend(t2, v, 0x33);
Vector256<float> t4 = Avx.UnpackLow(r4, r5);
Vector256<float> t6 = Avx.UnpackLow(r6, r7);
v = Avx.Shuffle(t4, t6, 0x4E);
d.V4 = Avx.Blend(t4, v, 0xCC);
d.V5 = Avx.Blend(t6, v, 0x33);
Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
v = Avx.Shuffle(t1, t3, 0x4E);
d.V2 = Avx.Blend(t1, v, 0xCC);
d.V3 = Avx.Blend(t3, v, 0x33);
Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
v = Avx.Shuffle(t5, t7, 0x4E);
d.V6 = Avx.Blend(t5, v, 0xCC);
d.V7 = Avx.Blend(t7, v, 0x33);
this.TransposeAvx();
}
else
#endif
{
d.V0L.X = this.V0L.X;
d.V1L.X = this.V0L.Y;
d.V2L.X = this.V0L.Z;
d.V3L.X = this.V0L.W;
d.V4L.X = this.V0R.X;
d.V5L.X = this.V0R.Y;
d.V6L.X = this.V0R.Z;
d.V7L.X = this.V0R.W;
d.V0L.Y = this.V1L.X;
d.V1L.Y = this.V1L.Y;
d.V2L.Y = this.V1L.Z;
d.V3L.Y = this.V1L.W;
d.V4L.Y = this.V1R.X;
d.V5L.Y = this.V1R.Y;
d.V6L.Y = this.V1R.Z;
d.V7L.Y = this.V1R.W;
d.V0L.Z = this.V2L.X;
d.V1L.Z = this.V2L.Y;
d.V2L.Z = this.V2L.Z;
d.V3L.Z = this.V2L.W;
d.V4L.Z = this.V2R.X;
d.V5L.Z = this.V2R.Y;
d.V6L.Z = this.V2R.Z;
d.V7L.Z = this.V2R.W;
d.V0L.W = this.V3L.X;
d.V1L.W = this.V3L.Y;
d.V2L.W = this.V3L.Z;
d.V3L.W = this.V3L.W;
d.V4L.W = this.V3R.X;
d.V5L.W = this.V3R.Y;
d.V6L.W = this.V3R.Z;
d.V7L.W = this.V3R.W;
d.V0R.X = this.V4L.X;
d.V1R.X = this.V4L.Y;
d.V2R.X = this.V4L.Z;
d.V3R.X = this.V4L.W;
d.V4R.X = this.V4R.X;
d.V5R.X = this.V4R.Y;
d.V6R.X = this.V4R.Z;
d.V7R.X = this.V4R.W;
d.V0R.Y = this.V5L.X;
d.V1R.Y = this.V5L.Y;
d.V2R.Y = this.V5L.Z;
d.V3R.Y = this.V5L.W;
d.V4R.Y = this.V5R.X;
d.V5R.Y = this.V5R.Y;
d.V6R.Y = this.V5R.Z;
d.V7R.Y = this.V5R.W;
d.V0R.Z = this.V6L.X;
d.V1R.Z = this.V6L.Y;
d.V2R.Z = this.V6L.Z;
d.V3R.Z = this.V6L.W;
d.V4R.Z = this.V6R.X;
d.V5R.Z = this.V6R.Y;
d.V6R.Z = this.V6R.Z;
d.V7R.Z = this.V6R.W;
d.V0R.W = this.V7L.X;
d.V1R.W = this.V7L.Y;
d.V2R.W = this.V7L.Z;
d.V3R.W = this.V7L.W;
d.V4R.W = this.V7R.X;
d.V5R.W = this.V7R.Y;
d.V6R.W = this.V7R.Z;
d.V7R.W = this.V7R.W;
this.TransposeScalar();
}
}
/// <summary>
/// Scalar fallback for <see cref="Transpose"/>: transposes the block in place by
/// swapping every element above the main diagonal with its mirror below it.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
private void TransposeScalar()
{
    // Row 7 has nothing to its right of the diagonal, so rows 0..6 are enough.
    for (int row = 0; row < 7; row++)
    {
        // Start one past the diagonal: diagonal elements stay where they are.
        for (int col = row + 1; col < 8; col++)
        {
            int above = (row * 8) + col;
            int below = (col * 8) + row;

            float held = this[above];
            this[above] = this[below];
            this[below] = held;
        }
    }
}

2
src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs

@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
// Dequantize:
block.MultiplyInPlace(ref this.DequantiazationTable);
FastFloatingPointDCT.TransformInplaceIDCT(ref block, ref this.WorkspaceBlock);
FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock);
// To conform better to libjpeg we actually NEED TO lose precision here.
// This is because they store blocks as Int16 between all the operations.

34
src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs

@ -94,8 +94,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
/// </summary>
private int bitCount;
private Block8x8F temporalBlock;
private Block8x8 temporalShortBlock;
private Block8x8 tempBlock;
/// <summary>
/// The output stream. All attempted writes after the first error become no-ops.
@ -130,6 +129,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
public void Encode444<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel<TPixel>
{
// Calculate reciprocal quantization tables for FDCT method
for (int i = 0; i < 64; i++)
{
luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i];
}
this.huffmanTables = HuffmanLut.TheHuffmanLut;
// ReSharper disable once InconsistentNaming
@ -190,6 +196,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel<TPixel>
{
// Calculate reciprocal quantization tables for FDCT method
for (int i = 0; i < 64; i++)
{
luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i];
}
this.huffmanTables = HuffmanLut.TheHuffmanLut;
// ReSharper disable once InconsistentNaming
@ -256,6 +269,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel<TPixel>
{
// Calculate reciprocal quantization tables for FDCT method
for (int i = 0; i < 64; i++)
{
luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
}
this.huffmanTables = HuffmanLut.TheHuffmanLut;
// ReSharper disable once InconsistentNaming
@ -301,6 +320,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
public void EncodeRgb<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel<TPixel>
{
// Calculate reciprocal quantization tables for FDCT method
for (int i = 0; i < 64; i++)
{
luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i];
}
this.huffmanTables = HuffmanLut.TheHuffmanLut;
// ReSharper disable once InconsistentNaming
@ -365,14 +390,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
ref Block8x8F block,
ref Block8x8F quant)
{
ref Block8x8F refTemp = ref this.temporalBlock;
ref Block8x8 spectralBlock = ref this.temporalShortBlock;
ref Block8x8 spectralBlock = ref this.tempBlock;
// Shifting level from 0..255 to -128..127
block.AddInPlace(-128f);
// Discrete cosine transform
FastFloatingPointDCT.TransformInplaceFDCT(ref block, ref refTemp);
FastFloatingPointDCT.TransformFDCT(ref block);
// Quantization
Block8x8F.Quantize(ref block, ref spectralBlock, ref quant);

210
src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs

@ -0,0 +1,210 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
#if SUPPORTS_RUNTIME_INTRINSICS
using System;
using System.Collections.Generic;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Text;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
internal static partial class FastFloatingPointDCT
{
/// <summary>
/// Gets reciprocal coefficients for jpeg quantization tables calculation.
/// </summary>
/// <remarks>
/// <para>
/// Current FDCT implementation expects its results to be multiplied by
/// a reciprocal quantization table. Values in this table must be divided
/// by quantization table values scaled with quality settings.
/// </para>
/// <para>
/// These values were calculated with this formula:
/// <code>
/// value[row * 8 + col] = 1 / (scalefactor[row] * scalefactor[col] * 8);
/// </code>
/// Where:
/// <code>
/// scalefactor[0] = 1
/// </code>
/// <code>
/// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
/// </code>
/// The factor of 8 is folded into the table so DCT code won't do unnecessary division.
/// For example <c>value[0] = 1 / (1 * 1 * 8) = 0.125</c>.
/// </para>
/// </remarks>
public static ReadOnlySpan<float> DctReciprocalAdjustmentCoefficients => new float[]
{
0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
};
#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
// Broadcast multipliers for the 1D FDCT passes: the mm256_* set is used by FDCT8x8_avx,
// the mm128_* set (same values) by FDCT8x4_sse.
//   0.707106781 = cos(4*PI/16)
//   0.382683433 = cos(6*PI/16)
//   0.541196100 = cos(2*PI/16) - cos(6*PI/16)
//   1.306562965 = cos(2*PI/16) + cos(6*PI/16)
private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
private static readonly Vector128<float> mm128_F_0_7071 = Vector128.Create(0.707106781f);
private static readonly Vector128<float> mm128_F_0_3826 = Vector128.Create(0.382683433f);
private static readonly Vector128<float> mm128_F_0_5411 = Vector128.Create(0.541196100f);
private static readonly Vector128<float> mm128_F_1_3065 = Vector128.Create(1.306562965f);
#pragma warning restore SA1310, SA1311, IDE1006
/// <summary>
/// Runs the floating point FDCT over <paramref name="block"/> in place using SIMD instructions.
/// </summary>
/// <remarks>
/// The work is two identical steps: transpose the block in place, then apply the 1D transform.
/// The first step processes rows, the second processes columns.
/// </remarks>
/// <param name="block">Input matrix, overwritten with the result.</param>
private static void ForwardTransformSimd(ref Block8x8F block)
{
    DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation.");

    // pass 0 - process rows, pass 1 - process columns
    for (int pass = 0; pass < 2; pass++)
    {
        block.Transpose();

        if (Avx.IsSupported)
        {
            FDCT8x8_avx(ref block);
            continue;
        }

        if (Sse.IsSupported)
        {
            // The SSE kernel handles one 8x4 part per call: left part first, then right part.
            ref Vector128<float> leftPart = ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0L);
            ref Vector128<float> rightPart = ref Unsafe.As<Vector4, Vector128<float>>(ref block.V0R);

            FDCT8x4_sse(ref leftPart);
            FDCT8x4_sse(ref rightPart);
        }
    }
}
/// <summary>
/// Apply 1D floating point FDCT inplace using SSE operations on 8x4 part of 8x8 matrix.
/// </summary>
/// <remarks>
/// Requires Sse support.
/// Must be called on both 8x4 matrix parts for the full FDCT transform.
/// Rows are addressed in <see cref="Vector128{T}"/> units at even offsets 0, 2, ..., 14 from
/// <paramref name="blockRef"/>, i.e. with a stride of two vectors per row
/// (presumably because the left and right halves of each row are adjacent in memory - confirm against Block8x8F).
/// </remarks>
/// <param name="blockRef">Reference to the first vector of the 8x4 part to transform; the part is overwritten with the result.</param>
public static void FDCT8x4_sse(ref Vector128<float> blockRef)
{
DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation.");
// Pair the vectors at offsets 0/14, 2/12, 4/10 and 6/8 into sums and differences.
// Every input is read here, before any of the stores below.
Vector128<float> tmp0 = Sse.Add(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14));
Vector128<float> tmp7 = Sse.Subtract(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14));
Vector128<float> tmp1 = Sse.Add(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12));
Vector128<float> tmp6 = Sse.Subtract(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12));
Vector128<float> tmp2 = Sse.Add(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10));
Vector128<float> tmp5 = Sse.Subtract(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10));
Vector128<float> tmp3 = Sse.Add(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8));
Vector128<float> tmp4 = Sse.Subtract(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8));
// Even part - built from the sums, stored at offsets 0, 8, 4 and 12
Vector128<float> tmp10 = Sse.Add(tmp0, tmp3);
Vector128<float> tmp13 = Sse.Subtract(tmp0, tmp3);
Vector128<float> tmp11 = Sse.Add(tmp1, tmp2);
Vector128<float> tmp12 = Sse.Subtract(tmp1, tmp2);
Unsafe.Add(ref blockRef, 0) = Sse.Add(tmp10, tmp11);
Unsafe.Add(ref blockRef, 8) = Sse.Subtract(tmp10, tmp11);
Vector128<float> z1 = Sse.Multiply(Sse.Add(tmp12, tmp13), mm128_F_0_7071);
Unsafe.Add(ref blockRef, 4) = Sse.Add(tmp13, z1);
Unsafe.Add(ref blockRef, 12) = Sse.Subtract(tmp13, z1);
// Odd part - built from the differences, stored at offsets 10, 6, 2 and 14
tmp10 = Sse.Add(tmp4, tmp5);
tmp11 = Sse.Add(tmp5, tmp6);
tmp12 = Sse.Add(tmp6, tmp7);
Vector128<float> z5 = Sse.Multiply(Sse.Subtract(tmp10, tmp12), mm128_F_0_3826);
Vector128<float> z2 = Sse.Add(Sse.Multiply(mm128_F_0_5411, tmp10), z5);
Vector128<float> z4 = Sse.Add(Sse.Multiply(mm128_F_1_3065, tmp12), z5);
Vector128<float> z3 = Sse.Multiply(tmp11, mm128_F_0_7071);
Vector128<float> z11 = Sse.Add(tmp7, z3);
Vector128<float> z13 = Sse.Subtract(tmp7, z3);
Unsafe.Add(ref blockRef, 10) = Sse.Add(z13, z2);
Unsafe.Add(ref blockRef, 6) = Sse.Subtract(z13, z2);
Unsafe.Add(ref blockRef, 2) = Sse.Add(z11, z4);
Unsafe.Add(ref blockRef, 14) = Sse.Subtract(z11, z4);
}
/// <summary>
/// Runs one 1D floating point FDCT pass over the whole 8x8 matrix in place using AVX operations.
/// </summary>
/// <remarks>
/// Requires Avx support.
/// Every row vector is read in the first stage, before any row is overwritten.
/// </remarks>
/// <param name="block">Input matrix, overwritten with the result.</param>
public static void FDCT8x8_avx(ref Block8x8F block)
{
    DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");

    // Stage 1: sums and differences of the mirrored row pairs (0,7), (1,6), (2,5), (3,4).
    Vector256<float> sum07 = Avx.Add(block.V0, block.V7);
    Vector256<float> diff07 = Avx.Subtract(block.V0, block.V7);
    Vector256<float> sum16 = Avx.Add(block.V1, block.V6);
    Vector256<float> diff16 = Avx.Subtract(block.V1, block.V6);
    Vector256<float> sum25 = Avx.Add(block.V2, block.V5);
    Vector256<float> diff25 = Avx.Subtract(block.V2, block.V5);
    Vector256<float> sum34 = Avx.Add(block.V3, block.V4);
    Vector256<float> diff34 = Avx.Subtract(block.V3, block.V4);

    // Even outputs (rows 0, 4, 2, 6) are built from the sums.
    Vector256<float> outerSum = Avx.Add(sum07, sum34);
    Vector256<float> outerDiff = Avx.Subtract(sum07, sum34);
    Vector256<float> innerSum = Avx.Add(sum16, sum25);
    Vector256<float> innerDiff = Avx.Subtract(sum16, sum25);

    block.V0 = Avx.Add(outerSum, innerSum);
    block.V4 = Avx.Subtract(outerSum, innerSum);

    Vector256<float> evenRot = Avx.Multiply(Avx.Add(innerDiff, outerDiff), mm256_F_0_7071);
    block.V2 = Avx.Add(outerDiff, evenRot);
    block.V6 = Avx.Subtract(outerDiff, evenRot);

    // Odd outputs (rows 5, 3, 1, 7) are built from the differences.
    Vector256<float> odd45 = Avx.Add(diff34, diff25);
    Vector256<float> odd56 = Avx.Add(diff25, diff16);
    Vector256<float> odd67 = Avx.Add(diff16, diff07);

    Vector256<float> shared = Avx.Multiply(Avx.Subtract(odd45, odd67), mm256_F_0_3826);
    Vector256<float> rotLow = Avx.Add(Avx.Multiply(mm256_F_0_5411, odd45), shared);
    Vector256<float> rotHigh = Avx.Add(Avx.Multiply(mm256_F_1_3065, odd67), shared);
    Vector256<float> mid = Avx.Multiply(odd56, mm256_F_0_7071);

    Vector256<float> plusMid = Avx.Add(diff07, mid);
    Vector256<float> minusMid = Avx.Subtract(diff07, mid);

    block.V5 = Avx.Add(minusMid, rotLow);
    block.V3 = Avx.Subtract(minusMid, rotLow);
    block.V1 = Avx.Add(plusMid, rotHigh);
    block.V7 = Avx.Subtract(plusMid, rotHigh);
}
}
}
#endif

400
src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs

@ -46,11 +46,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector256<float> C_V_0_5411 = Vector256.Create(0.541196f);
private static readonly Vector256<float> C_V_1_3065 = Vector256.Create(1.306563f);
private static readonly Vector256<float> C_V_1_1758 = Vector256.Create(1.175876f);
private static readonly Vector256<float> C_V_0_7856 = Vector256.Create(0.785695f);
private static readonly Vector256<float> C_V_1_3870 = Vector256.Create(1.387040f);
private static readonly Vector256<float> C_V_0_2758 = Vector256.Create(0.275899f);
private static readonly Vector256<float> C_V_n1_9615 = Vector256.Create(-1.961570560f);
private static readonly Vector256<float> C_V_n0_3901 = Vector256.Create(-0.390180644f);
@ -62,250 +58,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
private static readonly Vector256<float> C_V_1_5013 = Vector256.Create(1.501321110f);
private static readonly Vector256<float> C_V_n1_8477 = Vector256.Create(-1.847759065f);
private static readonly Vector256<float> C_V_0_7653 = Vector256.Create(0.765366865f);
private static readonly Vector256<float> C_V_InvSqrt2 = Vector256.Create(0.707107f);
#endif
#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f);
/// <summary>
/// Original:
/// <see>
/// <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
/// </see>
/// </summary>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>
public static void FDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
{
Vector4 c0 = s.V0L;
Vector4 c1 = s.V7L;
Vector4 t0 = c0 + c1;
Vector4 t7 = c0 - c1;
c1 = s.V6L;
c0 = s.V1L;
Vector4 t1 = c0 + c1;
Vector4 t6 = c0 - c1;
c1 = s.V5L;
c0 = s.V2L;
Vector4 t2 = c0 + c1;
Vector4 t5 = c0 - c1;
c0 = s.V3L;
c1 = s.V4L;
Vector4 t3 = c0 + c1;
Vector4 t4 = c0 - c1;
c0 = t0 + t3;
Vector4 c3 = t0 - t3;
c1 = t1 + t2;
Vector4 c2 = t1 - t2;
d.V0L = c0 + c1;
d.V4L = c0 - c1;
float w0 = 0.541196f;
float w1 = 1.306563f;
d.V2L = (w0 * c2) + (w1 * c3);
d.V6L = (w0 * c3) - (w1 * c2);
w0 = 1.175876f;
w1 = 0.785695f;
c3 = (w0 * t4) + (w1 * t7);
c0 = (w0 * t7) - (w1 * t4);
w0 = 1.387040f;
w1 = 0.275899f;
c2 = (w0 * t5) + (w1 * t6);
c1 = (w0 * t6) - (w1 * t5);
d.V3L = c0 - c2;
d.V5L = c3 - c1;
float invsqrt2 = 0.707107f;
c0 = (c0 + c2) * invsqrt2;
c3 = (c3 + c1) * invsqrt2;
d.V1L = c0 + c3;
d.V7L = c0 - c3;
}
/// <summary>
/// Original:
/// <see>
/// <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
/// </see>
/// </summary>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>
public static void FDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
{
Vector4 c0 = s.V0R;
Vector4 c1 = s.V7R;
Vector4 t0 = c0 + c1;
Vector4 t7 = c0 - c1;
c1 = s.V6R;
c0 = s.V1R;
Vector4 t1 = c0 + c1;
Vector4 t6 = c0 - c1;
c1 = s.V5R;
c0 = s.V2R;
Vector4 t2 = c0 + c1;
Vector4 t5 = c0 - c1;
c0 = s.V3R;
c1 = s.V4R;
Vector4 t3 = c0 + c1;
Vector4 t4 = c0 - c1;
c0 = t0 + t3;
Vector4 c3 = t0 - t3;
c1 = t1 + t2;
Vector4 c2 = t1 - t2;
d.V0R = c0 + c1;
d.V4R = c0 - c1;
float w0 = 0.541196f;
float w1 = 1.306563f;
d.V2R = (w0 * c2) + (w1 * c3);
d.V6R = (w0 * c3) - (w1 * c2);
w0 = 1.175876f;
w1 = 0.785695f;
c3 = (w0 * t4) + (w1 * t7);
c0 = (w0 * t7) - (w1 * t4);
w0 = 1.387040f;
w1 = 0.275899f;
c2 = (w0 * t5) + (w1 * t6);
c1 = (w0 * t6) - (w1 * t5);
d.V3R = c0 - c2;
d.V5R = c3 - c1;
c0 = (c0 + c2) * InvSqrt2;
c3 = (c3 + c1) * InvSqrt2;
d.V1R = c0 + c3;
d.V7R = c0 - c3;
}
/// <summary>
/// Combined operation of <see cref="FDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="FDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
/// using AVX commands.
/// </summary>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>
public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
Vector256<float> t0 = Avx.Add(s.V0, s.V7);
Vector256<float> t7 = Avx.Subtract(s.V0, s.V7);
Vector256<float> t1 = Avx.Add(s.V1, s.V6);
Vector256<float> t6 = Avx.Subtract(s.V1, s.V6);
Vector256<float> t2 = Avx.Add(s.V2, s.V5);
Vector256<float> t5 = Avx.Subtract(s.V2, s.V5);
Vector256<float> t3 = Avx.Add(s.V3, s.V4);
Vector256<float> t4 = Avx.Subtract(s.V3, s.V4);
Vector256<float> c0 = Avx.Add(t0, t3);
Vector256<float> c1 = Avx.Add(t1, t2);
// 0 4
d.V0 = Avx.Add(c0, c1);
d.V4 = Avx.Subtract(c0, c1);
Vector256<float> c3 = Avx.Subtract(t0, t3);
Vector256<float> c2 = Avx.Subtract(t1, t2);
// 2 6
d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065);
d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(c2, C_V_1_3065), c3, C_V_0_5411);
c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856);
c0 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(t4, C_V_0_7856), t7, C_V_1_1758);
c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6);
c1 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, t5), t6, C_V_1_3870);
// 3 5
d.V3 = Avx.Subtract(c0, c2);
d.V5 = Avx.Subtract(c3, c1);
c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2);
c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2);
// 1 7
d.V1 = Avx.Add(c0, c3);
d.V7 = Avx.Subtract(c0, c3);
#endif
}
/// <summary>
/// Performs 8x8 matrix Forward Discrete Cosine Transform
/// </summary>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>
public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
FDCT8x8_Avx(ref s, ref d);
}
else
#endif
{
FDCT8x4_LeftPart(ref s, ref d);
FDCT8x4_RightPart(ref s, ref d);
}
}
/// <summary>
/// Apply floating point FDCT from src into dest
/// </summary>
/// <param name="src">Source</param>
/// <param name="dest">Destination</param>
/// <param name="temp">Temporary block provided by the caller for optimization</param>
public static void TransformFDCT(
ref Block8x8F src,
ref Block8x8F dest,
ref Block8x8F temp)
{
src.TransposeInto(ref temp);
FDCT8x8(ref temp, ref dest);
dest.TransposeInto(ref temp);
FDCT8x8(ref temp, ref dest);
dest.MultiplyInPlace(C_0_125);
}
/// <summary>
/// Apply floating point FDCT inplace.
/// </summary>
/// <param name="matrix">Input matrix.</param>
/// <param name="temp">Matrix to store temporal results.</param>
public static void TransformInplaceFDCT(ref Block8x8F matrix, ref Block8x8F temp)
{
matrix.TransposeInto(ref temp);
FDCT8x8(ref temp, ref matrix);
matrix.TransposeInto(ref temp);
FDCT8x8(ref temp, ref matrix);
matrix.MultiplyInPlace(C_0_125);
}
/// <summary>
/// Performs 8x8 matrix Inverse Discrete Cosine Transform
/// </summary>
@ -501,40 +255,148 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
}
/// <summary>
/// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
/// Apply floating point IDCT inplace.
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
/// </summary>
/// <param name="src">Source</param>
/// <param name="dest">Destination</param>
/// <param name="temp">Temporary block provided by the caller</param>
public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
/// <param name="block">Input matrix.</param>
/// <param name="temp">Matrix to store temporary results.</param>
public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
{
src.TransposeInto(ref temp);
IDCT8x8(ref temp, ref dest);
dest.TransposeInto(ref temp);
IDCT8x8(ref temp, ref dest);
block.Transpose();
IDCT8x8(ref block, ref temp);
temp.Transpose();
IDCT8x8(ref temp, ref block);
// TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
dest.MultiplyInPlace(C_0_125);
block.MultiplyInPlace(C_0_125);
}
/// <summary>
/// Apply floating point IDCT inplace.
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
/// Apply 2D floating point FDCT inplace using scalar operations.
/// </summary>
/// <param name="matrix">Input matrix.</param>
/// <param name="temp">Matrix to store temporal results.</param>
public static void TransformInplaceIDCT(ref Block8x8F block, ref Block8x8F temp)
/// <remarks>
/// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
/// </remarks>
/// <param name="block">Input matrix.</param>
private static void ForwardTransformScalar(ref Block8x8F block)
{
block.TransposeInto(ref temp);
const int dctSize = 8;
IDCT8x8(ref temp, ref block);
block.TransposeInto(ref temp);
IDCT8x8(ref temp, ref block);
float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
float tmp10, tmp11, tmp12, tmp13;
float z1, z2, z3, z4, z5, z11, z13;
// TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
block.MultiplyInPlace(C_0_125);
// First pass - process rows
ref float dataRef = ref Unsafe.As<Block8x8F, float>(ref block);
for (int ctr = 7; ctr >= 0; ctr--)
{
tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7);
tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7);
tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6);
tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6);
tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5);
tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5);
tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4);
tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4);
// Even part
tmp10 = tmp0 + tmp3;
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11;
Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11;
z1 = (tmp12 + tmp13) * 0.707106781f;
Unsafe.Add(ref dataRef, 2) = tmp13 + z1;
Unsafe.Add(ref dataRef, 6) = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5;
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
z5 = (tmp10 - tmp12) * 0.382683433f;
z2 = (0.541196100f * tmp10) + z5;
z4 = (1.306562965f * tmp12) + z5;
z3 = tmp11 * 0.707106781f;
z11 = tmp7 + z3;
z13 = tmp7 - z3;
Unsafe.Add(ref dataRef, 5) = z13 + z2;
Unsafe.Add(ref dataRef, 3) = z13 - z2;
Unsafe.Add(ref dataRef, 1) = z11 + z4;
Unsafe.Add(ref dataRef, 7) = z11 - z4;
dataRef = ref Unsafe.Add(ref dataRef, dctSize);
}
// Second pass - process columns
dataRef = ref Unsafe.As<Block8x8F, float>(ref block);
for (int ctr = 7; ctr >= 0; ctr--)
{
tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7);
tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7);
tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6);
tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6);
tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5);
tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5);
tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4);
tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4);
// Even part
tmp10 = tmp0 + tmp3;
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11;
Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11;
z1 = (tmp12 + tmp13) * 0.707106781f;
Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1;
Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5;
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
z5 = (tmp10 - tmp12) * 0.382683433f;
z2 = (0.541196100f * tmp10) + z5;
z4 = (1.306562965f * tmp12) + z5;
z3 = tmp11 * 0.707106781f;
z11 = tmp7 + z3;
z13 = tmp7 - z3;
Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2;
Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2;
Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4;
Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4;
dataRef = ref Unsafe.Add(ref dataRef, 1);
}
}
/// <summary>
/// Applies the 2D floating point forward DCT to the given block, in place.
/// </summary>
/// <remarks>
/// When compiled with SUPPORTS_RUNTIME_INTRINSICS and either AVX or SSE is
/// reported as supported, the work is delegated to ForwardTransformSimd;
/// in every other case ForwardTransformScalar is used.
/// </remarks>
/// <param name="block">Input matrix; it is overwritten with the transform result.</param>
public static void TransformFDCT(ref Block8x8F block)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    // Hardware-accelerated path: taken as soon as one of the two instruction sets is available.
    if (Avx.IsSupported || Sse.IsSupported)
    {
        ForwardTransformSimd(ref block);
        return;
    }
#endif

    // Portable fallback: intrinsics are compiled out or not supported at runtime.
    ForwardTransformScalar(ref block);
}
}
}

108
src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs

@ -10,10 +10,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
internal static partial class ZigZag
{
#pragma warning disable SA1309 // naming rules violation warnings
/// <summary>
/// Special byte value to zero out elements during Sse/Avx shuffle intrinsics.
/// </summary>
private const byte Z = 0xff;
private const byte _ = 0xff;
#pragma warning restore SA1309
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSse"/>
@ -22,82 +24,82 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
{
// 0_A
0, 1, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, 6, 7, Z, Z,
0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
// 0_B
Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5,
_, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
// 0_C
Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
// 1_A
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, 10, 11,
_, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
// 1_B
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, Z, Z,
_, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _,
// 1_C
2, 3, Z, Z, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z,
2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
// 1_D
Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
// 1_E
Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
// 2_B
8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 2_C
Z, Z, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
// 2_D
Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _,
// 2_E
Z, Z, Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5,
_, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
// 2_F
Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z,
_, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
// 2_G
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z,
_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _,
// 3_A
Z, Z, Z, Z, Z, Z, 12, 13, 14, 15, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
// 3_B
Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z,
_, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
// 3_C
Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z,
_, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
// 3_D/4_E
6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9,
6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
// 4_F
Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z,
_, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
// 4_G
Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z,
_, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
// 4_H
Z, Z, Z, Z, Z, Z, 0, 1, 2, 3, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
// 5_B
Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
// 5_C
Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
// 5_D
10, 11, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, Z, Z,
10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
// 5_E
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z,
_, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _,
// 5_F
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, Z, Z,
_, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _,
// 5_G
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7,
_, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
// 6_D
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z,
_, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _,
// 6_E
Z, Z, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z,
_, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
// 6_F
Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13,
_, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
// 6_G
Z, Z, Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _,
// 6_H
4, 5, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
// 7_F
Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _,
// 7_G
10, 11, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z,
10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
// 7_H
Z, Z, 8, 9, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, 14, 15
_, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
};
/// <summary>
@ -110,55 +112,55 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0,
// 01_AB - inner-lane
0, 1, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, 6, 7,
0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7,
// 01_CD/23_GH - cross-lane
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, Z, Z, Z, Z, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, Z, Z, Z, Z,
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _,
// 01_CD - inner-lane
Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _,
// 01_EF - inner-lane
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
// 23_AB/45_CD/67_EF - cross-lane
3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, Z, Z, Z, Z, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, Z, Z, Z, Z,
3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,
// 23_AB - inner-lane
4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, 2, 3, 8, 9, Z, Z, Z, Z,
4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _,
// 23_CD - inner-lane
Z, Z, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 12, 13,
_, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13,
// 23_EF - inner-lane
Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 23_GH - inner-lane
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 45_AB - inner-lane
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _,
// 45_CD - inner-lane
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, Z, Z, 2, 3, 8, 9, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _,
// 45_EF - cross-lane
1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, Z, Z, Z, Z, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
// 45_EF - inner-lane
2, 3, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z,
2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _,
// 45_GH - inner-lane
Z, Z, Z, Z, 2, 3, 8, 9, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7,
_, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
// 67_CD - inner-lane
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 67_EF - inner-lane
Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, Z, Z, 2, 3, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z,
_, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _,
// 67_GH - inner-lane
8, 9, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, 10, 11, 4, 5, Z, Z, 6, 7, 12, 13, 14, 15
8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15
};
/// <summary>

8
tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs

@ -12,15 +12,11 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
private static readonly Block8x8F Source = Create8x8FloatData();
[Benchmark]
public void TransposeInto()
{
var dest = default(Block8x8F);
Source.TransposeInto(ref dest);
}
public void TransposeInto() => Source.Transpose();
private static Block8x8F Create8x8FloatData()
{
var result = new float[64];
float[] result = new float[64];
for (int i = 0; i < 8; i++)
{
for (int j = 0; j < 8; j++)

50
tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs

@ -164,52 +164,27 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
}
[Fact]
public void TransposeInto()
public void Transpose()
{
static void RunTest()
{
float[] expected = Create8x8FloatData();
ReferenceImplementations.Transpose8x8(expected);
var source = default(Block8x8F);
source.LoadFrom(Create8x8FloatData());
var block8x8 = default(Block8x8F);
block8x8.LoadFrom(Create8x8FloatData());
var dest = default(Block8x8F);
source.TransposeInto(ref dest);
block8x8.Transpose();
float[] actual = new float[64];
dest.ScaledCopyTo(actual);
block8x8.ScaledCopyTo(actual);
Assert.Equal(expected, actual);
}
FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest,
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX);
}
private class BufferHolder
{
public Block8x8F Buffer;
}
[Fact]
public void TransposeInto_Benchmark()
{
var source = new BufferHolder();
source.Buffer.LoadFrom(Create8x8FloatData());
var dest = new BufferHolder();
this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark X {Times} ...");
var sw = Stopwatch.StartNew();
for (int i = 0; i < Times; i++)
{
source.Buffer.TransposeInto(ref dest.Buffer);
}
sw.Stop();
this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark finished in {sw.ElapsedMilliseconds} ms");
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
}
private static float[] Create8x8ColorCropTestData()
@ -281,16 +256,21 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed);
Block8x8F quant = CreateRandomFloatBlock(-2000, 2000, qtSeed);
// Reference implementation quantizes given block via division
Block8x8 expected = default;
ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder);
// Actual current implementation quantizes given block via multiplication
// With quantization table reciprocal
for (int i = 0; i < Block8x8F.Size; i++)
{
quant[i] = 1f / quant[i];
}
Block8x8 actual = default;
Block8x8F.Quantize(ref source, ref actual, ref quant);
for (int i = 0; i < Block8x8.Size; i++)
{
Assert.Equal(expected[i], actual[i]);
}
this.CompareBlocks(expected, actual, 1);
}
[Fact]

149
tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs

@ -2,6 +2,9 @@
// Licensed under the Apache License, Version 2.0.
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics.X86;
#endif
@ -33,15 +36,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
{
float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
var source = Block8x8F.Load(sourceArray);
var srcBlock = Block8x8F.Load(sourceArray);
Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source);
Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock);
var temp = default(Block8x8F);
var actual = default(Block8x8F);
FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);
this.CompareBlocks(expected, actual, 1f);
this.CompareBlocks(expected, srcBlock, 1f);
}
[Theory]
@ -52,15 +54,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
{
float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
var source = Block8x8F.Load(sourceArray);
var srcBlock = Block8x8F.Load(sourceArray);
Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source);
Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock);
var temp = default(Block8x8F);
var actual = default(Block8x8F);
FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);
this.CompareBlocks(expected, actual, 1f);
this.CompareBlocks(expected, srcBlock, 1f);
}
// Inverse transform
@ -167,8 +168,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
var destBlock = default(Block8x8F);
var expectedDest = new float[64];
var temp1 = new float[64];
var temp2 = default(Block8x8F);
@ -177,10 +176,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1);
// testee
FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2);
FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2);
var actualDest = new float[64];
destBlock.ScaledCopyTo(actualDest);
srcBlock.ScaledCopyTo(actualDest);
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
}
@ -198,95 +197,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
}
// Forward transform
[Theory]
[InlineData(1)]
[InlineData(2)]
public void FDCT8x4_LeftPart(int seed)
{
Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
var destBlock = default(Block8x8F);
var expectedDest = new float[64];
// reference
ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest);
// testee
FastFloatingPointDCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock);
var actualDest = new float[64];
destBlock.ScaledCopyTo(actualDest);
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
}
[Theory]
[InlineData(1)]
[InlineData(2)]
public void FDCT8x4_RightPart(int seed)
{
Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
var destBlock = default(Block8x8F);
var expectedDest = new float[64];
// reference
ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
// testee
FastFloatingPointDCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock);
var actualDest = new float[64];
destBlock.ScaledCopyTo(actualDest);
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
}
[Theory]
[InlineData(1)]
[InlineData(2)]
public void FDCT8x8_Avx(int seed)
{
#if SUPPORTS_RUNTIME_INTRINSICS
var skip = !Avx.IsSupported;
#else
var skip = true;
#endif
if (skip)
{
this.Output.WriteLine("No AVX present, skipping test!");
return;
}
Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
var destBlock = default(Block8x8F);
var expectedDest = new float[64];
// reference, left part
ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest);
// reference, right part
ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
// testee, whole 8x8
FastFloatingPointDCT.FDCT8x8_Avx(ref srcBlock, ref destBlock);
var actualDest = new float[64];
destBlock.ScaledCopyTo(actualDest);
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
}
// This test covers entire FDCT conversions chain
// This test checks all implementations: intrinsic and scalar fallback
[Theory]
[InlineData(1)]
[InlineData(2)]
@ -297,37 +209,38 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
int seed = FeatureTestRunner.Deserialize<int>(serialized);
Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
var destBlock = default(Block8x8F);
var block = default(Block8x8F);
block.LoadFrom(src);
var expectedDest = new float[64];
var temp1 = new float[64];
var temp2 = default(Block8x8F);
float[] expectedDest = new float[64];
float[] temp1 = new float[64];
// reference
ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
// testee
FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2);
// Part of the FDCT calculations is fused into the quantization step
// We must multiply the transformed block by the reciprocal values from FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients
FastFloatingPointDCT.TransformFDCT(ref block);
for (int i = 0; i < 64; i++)
{
block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i];
}
var actualDest = new float[64];
destBlock.ScaledCopyTo(actualDest);
float[] actualDest = block.ToArray();
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
}
// 4 paths:
// 1. AllowAll - call avx/fma implementation
// 2. DisableFMA - call avx implementation without fma acceleration
// 3. DisableAvx - call fallback code of Vector4 implementation
//
// DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result
// 3. DisableAvx - call sse implementation
// 4. DisableHWIntrinsic - call scalar fallback implementation
FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest,
seed,
HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX);
HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
}
}
}

Loading…
Cancel
Save