Browse Source

Modernize additional V256 code from review

pull/2918/head
James Jackson-South 1 year ago
parent
commit
6238f00895
  1. 23
      src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
  2. 22
      src/ImageSharp/Common/Helpers/Vector256Utilities.cs
  3. 4
      src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
  4. 142
      src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
  5. 142
      src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs
  6. 14
      src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs
  7. 2
      tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
  8. 2
      tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs

23
src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

@ -619,29 +619,6 @@ internal static partial class SimdUtils
return va + (vm0 * vm1);
}
/// <summary>
/// Multiplies two <see cref="Vector256{Single}"/> values and subtracts a third vector from the product.
/// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
/// </summary>
/// <remarks>ret = (vm0 * vm1) - vs</remarks>
/// <param name="vs">The vector subtracted from the product.</param>
/// <param name="vm0">The left-hand multiplicand.</param>
/// <param name="vm1">The right-hand multiplicand.</param>
/// <returns>The resulting <see cref="Vector256{T}"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static Vector256<float> MultiplySubtract(
    Vector256<float> vs,
    Vector256<float> vm0,
    Vector256<float> vm1)
{
    // Fallback: separate AVX multiply and subtract when FMA is unavailable.
    if (!Fma.IsSupported)
    {
        Vector256<float> product = Avx.Multiply(vm0, vm1);
        return Avx.Subtract(product, vs);
    }

    // The intrinsic takes the two multiplicands first and the subtrahend last.
    return Fma.MultiplySubtract(vm1, vm0, vs);
}
/// <summary>
/// Performs a multiplication and a negated addition of the <see cref="Vector256{Single}"/>.
/// </summary>

22
src/ImageSharp/Common/Helpers/Vector256Utilities.cs

@ -140,6 +140,28 @@ internal static class Vector256_
return va + (vm0 * vm1);
}
/// <summary>
/// Multiplies two <see cref="Vector256{Single}"/> values and subtracts a third vector from the product.
/// </summary>
/// <remarks>ret = (vm0 * vm1) - vs</remarks>
/// <param name="vs">The vector subtracted from the product.</param>
/// <param name="vm0">The left-hand multiplicand.</param>
/// <param name="vm1">The right-hand multiplicand.</param>
/// <returns>The resulting <see cref="Vector256{T}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> MultiplySubtract(
    Vector256<float> vs,
    Vector256<float> vm0,
    Vector256<float> vm1)
    // Use the fused intrinsic when FMA is available; otherwise multiply and subtract with the vector operators.
    => Fma.IsSupported
        ? Fma.MultiplySubtract(vm1, vm0, vs)
        : (vm0 * vm1) - vs;
/// <summary>
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
/// </summary>

4
src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs

@ -211,10 +211,10 @@ internal partial struct Block8x8
}
/// <summary>
/// Transpose the block inplace.
/// Transpose the block in place.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public void TransposeInplace()
public void TransposeInPlace()
{
ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);

142
src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs

@ -1,142 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
// AVX implementations of the 8x8 floating point forward (FDCT) and inverse (IDCT) transforms.
// Both entry points run a 1D pass over the eight Vector256 rows of the block, transpose the
// block in place, then run the same 1D pass again.
internal static partial class FloatingPointDCT
{
/// <summary>
/// Apply floating point FDCT in place using simd operations.
/// </summary>
/// <remarks>
/// The result overwrites the contents of <paramref name="block"/>.
/// </remarks>
/// <param name="block">Input block.</param>
private static void FDCT8x8_Avx(ref Block8x8F block)
{
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
// First pass - process columns
FDCT8x8_1D_Avx(ref block);
// Second pass - process rows
block.TransposeInPlace();
FDCT8x8_1D_Avx(ref block);
// Applies 1D floating point FDCT in place
static void FDCT8x8_1D_Avx(ref Block8x8F block)
{
// Pairwise sums (tmp0..tmp3) and differences (tmp4..tmp7) of mirrored rows 0/7, 1/6, 2/5 and 3/4.
Vector256<float> tmp0 = Avx.Add(block.V256_0, block.V256_7);
Vector256<float> tmp7 = Avx.Subtract(block.V256_0, block.V256_7);
Vector256<float> tmp1 = Avx.Add(block.V256_1, block.V256_6);
Vector256<float> tmp6 = Avx.Subtract(block.V256_1, block.V256_6);
Vector256<float> tmp2 = Avx.Add(block.V256_2, block.V256_5);
Vector256<float> tmp5 = Avx.Subtract(block.V256_2, block.V256_5);
Vector256<float> tmp3 = Avx.Add(block.V256_3, block.V256_4);
Vector256<float> tmp4 = Avx.Subtract(block.V256_3, block.V256_4);
// Even part: built from the pairwise sums; writes rows 0, 4, 2 and 6.
Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
block.V256_0 = Avx.Add(tmp10, tmp11);
block.V256_4 = Avx.Subtract(tmp10, tmp11);
// 0.707106781 ~= 1 / sqrt(2); the same constant is reused for z3 in the odd part.
var mm256_F_0_7071 = Vector256.Create(0.707106781f);
Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
block.V256_2 = Avx.Add(tmp13, z1);
block.V256_6 = Avx.Subtract(tmp13, z1);
// Odd part: built from the pairwise differences; writes rows 5, 3, 1 and 7.
// tmp10..tmp12 are reused here as scratch values.
tmp10 = Avx.Add(tmp4, tmp5);
tmp11 = Avx.Add(tmp5, tmp6);
tmp12 = Avx.Add(tmp6, tmp7);
Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), Vector256.Create(0.382683433f)); // mm256_F_0_3826
// NOTE(review): assumes MultiplyAdd(va, vm0, vm1) returns va + (vm0 * vm1), i.e. z2 = z5 + (0.541196100 * tmp10) - confirm against the helper.
Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411
Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065
Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
Vector256<float> z11 = Avx.Add(tmp7, z3);
Vector256<float> z13 = Avx.Subtract(tmp7, z3);
block.V256_5 = Avx.Add(z13, z2);
block.V256_3 = Avx.Subtract(z13, z2);
block.V256_1 = Avx.Add(z11, z4);
block.V256_7 = Avx.Subtract(z11, z4);
}
}
/// <summary>
/// Apply floating point IDCT in place using simd operations.
/// </summary>
/// <remarks>
/// The result overwrites the contents of <paramref name="transposedBlock"/>.
/// </remarks>
/// <param name="transposedBlock">Transposed input block.</param>
private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
{
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
// First pass - process columns
IDCT8x8_1D_Avx(ref transposedBlock);
// Second pass - process rows
transposedBlock.TransposeInPlace();
IDCT8x8_1D_Avx(ref transposedBlock);
// Applies 1D floating point IDCT in place
static void IDCT8x8_1D_Avx(ref Block8x8F block)
{
// Even part: reads rows 0, 2, 4 and 6.
Vector256<float> tmp0 = block.V256_0;
Vector256<float> tmp1 = block.V256_2;
Vector256<float> tmp2 = block.V256_4;
Vector256<float> tmp3 = block.V256_6;
// z5 starts as a copy of row 0 and is reused as a scratch value in the odd part.
Vector256<float> z5 = tmp0;
Vector256<float> tmp10 = Avx.Add(z5, tmp2);
Vector256<float> tmp11 = Avx.Subtract(z5, tmp2);
// 1.414213562 ~= sqrt(2); also used for tmp11 in the odd part.
var mm256_F_1_4142 = Vector256.Create(1.414213562f);
Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
// MultiplySubtract(vs, vm0, vm1) is documented as (vm0 * vm1) - vs, so tmp12 = ((tmp1 - tmp3) * sqrt(2)) - tmp13.
Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
tmp0 = Avx.Add(tmp10, tmp13);
tmp3 = Avx.Subtract(tmp10, tmp13);
tmp1 = Avx.Add(tmp11, tmp12);
tmp2 = Avx.Subtract(tmp11, tmp12);
// Odd part: reads rows 1, 3, 5 and 7.
Vector256<float> tmp4 = block.V256_1;
Vector256<float> tmp5 = block.V256_3;
Vector256<float> tmp6 = block.V256_5;
Vector256<float> tmp7 = block.V256_7;
Vector256<float> z13 = Avx.Add(tmp6, tmp5);
Vector256<float> z10 = Avx.Subtract(tmp6, tmp5);
Vector256<float> z11 = Avx.Add(tmp4, tmp7);
Vector256<float> z12 = Avx.Subtract(tmp4, tmp7);
tmp7 = Avx.Add(z11, z13);
tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);
z5 = Avx.Multiply(Avx.Add(z10, z12), Vector256.Create(1.847759065f)); // mm256_F_1_8477
// NOTE(review): assumes MultiplyAdd(va, vm0, vm1) returns va + (vm0 * vm1), i.e. tmp10 = z5 + (z12 * -1.082392200) - confirm against the helper.
tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823
tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131
// Each of these depends on the value assigned on the line before it; the order must be kept.
tmp6 = Avx.Subtract(tmp12, tmp7);
tmp5 = Avx.Subtract(tmp11, tmp6);
tmp4 = Avx.Subtract(tmp10, tmp5);
// Combine the even (tmp0..tmp3) and odd (tmp4..tmp7) parts into mirrored output rows 0/7, 1/6, 2/5 and 3/4.
block.V256_0 = Avx.Add(tmp0, tmp7);
block.V256_7 = Avx.Subtract(tmp0, tmp7);
block.V256_1 = Avx.Add(tmp1, tmp6);
block.V256_6 = Avx.Subtract(tmp1, tmp6);
block.V256_2 = Avx.Add(tmp2, tmp5);
block.V256_5 = Avx.Subtract(tmp2, tmp5);
block.V256_3 = Avx.Add(tmp3, tmp4);
block.V256_4 = Avx.Subtract(tmp3, tmp4);
}
}
}

142
src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs

@ -0,0 +1,142 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
// Vector256 implementations of the 8x8 floating point forward (FDCT) and inverse (IDCT) transforms.
// Both entry points run a 1D pass over the eight Vector256 rows of the block, transpose the
// block in place, then run the same 1D pass again.
internal static partial class FloatingPointDCT
{
/// <summary>
/// Apply floating point FDCT in place using simd operations.
/// </summary>
/// <remarks>
/// The result overwrites the contents of <paramref name="block"/>.
/// </remarks>
/// <param name="block">Input block.</param>
private static void FDCT8x8_Vector256(ref Block8x8F block)
{
DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");
// First pass - process columns
FDCT8x8_1D_Vector256(ref block);
// Second pass - process rows
block.TransposeInPlace();
FDCT8x8_1D_Vector256(ref block);
// Applies 1D floating point FDCT in place
static void FDCT8x8_1D_Vector256(ref Block8x8F block)
{
// Pairwise sums (tmp0..tmp3) and differences (tmp4..tmp7) of mirrored rows 0/7, 1/6, 2/5 and 3/4.
Vector256<float> tmp0 = block.V256_0 + block.V256_7;
Vector256<float> tmp7 = block.V256_0 - block.V256_7;
Vector256<float> tmp1 = block.V256_1 + block.V256_6;
Vector256<float> tmp6 = block.V256_1 - block.V256_6;
Vector256<float> tmp2 = block.V256_2 + block.V256_5;
Vector256<float> tmp5 = block.V256_2 - block.V256_5;
Vector256<float> tmp3 = block.V256_3 + block.V256_4;
Vector256<float> tmp4 = block.V256_3 - block.V256_4;
// Even part: built from the pairwise sums; writes rows 0, 4, 2 and 6.
Vector256<float> tmp10 = tmp0 + tmp3;
Vector256<float> tmp13 = tmp0 - tmp3;
Vector256<float> tmp11 = tmp1 + tmp2;
Vector256<float> tmp12 = tmp1 - tmp2;
block.V256_0 = tmp10 + tmp11;
block.V256_4 = tmp10 - tmp11;
// 0.707106781 ~= 1 / sqrt(2); the same constant is reused for z3 in the odd part.
Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
Vector256<float> z1 = (tmp12 + tmp13) * mm256_F_0_7071;
block.V256_2 = tmp13 + z1;
block.V256_6 = tmp13 - z1;
// Odd part: built from the pairwise differences; writes rows 5, 3, 1 and 7.
// tmp10..tmp12 are reused here as scratch values.
tmp10 = tmp4 + tmp5;
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
Vector256<float> z5 = (tmp10 - tmp12) * Vector256.Create(0.382683433f); // mm256_F_0_3826
// NOTE(review): assumes MultiplyAdd(va, vm0, vm1) returns va + (vm0 * vm1), i.e. z2 = z5 + (0.541196100 * tmp10) - confirm against the helper.
Vector256<float> z2 = Vector256_.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411
Vector256<float> z4 = Vector256_.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065
Vector256<float> z3 = tmp11 * mm256_F_0_7071;
Vector256<float> z11 = tmp7 + z3;
Vector256<float> z13 = tmp7 - z3;
block.V256_5 = z13 + z2;
block.V256_3 = z13 - z2;
block.V256_1 = z11 + z4;
block.V256_7 = z11 - z4;
}
}
/// <summary>
/// Apply floating point IDCT in place using simd operations.
/// </summary>
/// <remarks>
/// The result overwrites the contents of <paramref name="transposedBlock"/>.
/// </remarks>
/// <param name="transposedBlock">Transposed input block.</param>
private static void IDCT8x8_Vector256(ref Block8x8F transposedBlock)
{
DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");
// First pass - process columns
IDCT8x8_1D_Vector256(ref transposedBlock);
// Second pass - process rows
transposedBlock.TransposeInPlace();
IDCT8x8_1D_Vector256(ref transposedBlock);
// Applies 1D floating point IDCT in place
static void IDCT8x8_1D_Vector256(ref Block8x8F block)
{
// Even part: reads rows 0, 2, 4 and 6.
Vector256<float> tmp0 = block.V256_0;
Vector256<float> tmp1 = block.V256_2;
Vector256<float> tmp2 = block.V256_4;
Vector256<float> tmp3 = block.V256_6;
// z5 starts as a copy of row 0 and is reused as a scratch value in the odd part.
Vector256<float> z5 = tmp0;
Vector256<float> tmp10 = z5 + tmp2;
Vector256<float> tmp11 = z5 - tmp2;
// 1.414213562 ~= sqrt(2); also used for tmp11 in the odd part.
Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
Vector256<float> tmp13 = tmp1 + tmp3;
// MultiplySubtract(vs, vm0, vm1) is documented as (vm0 * vm1) - vs, so tmp12 = ((tmp1 - tmp3) * sqrt(2)) - tmp13.
Vector256<float> tmp12 = Vector256_.MultiplySubtract(tmp13, tmp1 - tmp3, mm256_F_1_4142);
tmp0 = tmp10 + tmp13;
tmp3 = tmp10 - tmp13;
tmp1 = tmp11 + tmp12;
tmp2 = tmp11 - tmp12;
// Odd part: reads rows 1, 3, 5 and 7.
Vector256<float> tmp4 = block.V256_1;
Vector256<float> tmp5 = block.V256_3;
Vector256<float> tmp6 = block.V256_5;
Vector256<float> tmp7 = block.V256_7;
Vector256<float> z13 = tmp6 + tmp5;
Vector256<float> z10 = tmp6 - tmp5;
Vector256<float> z11 = tmp4 + tmp7;
Vector256<float> z12 = tmp4 - tmp7;
tmp7 = z11 + z13;
tmp11 = (z11 - z13) * mm256_F_1_4142;
z5 = (z10 + z12) * Vector256.Create(1.847759065f); // mm256_F_1_8477
// NOTE(review): assumes MultiplyAdd(va, vm0, vm1) returns va + (vm0 * vm1), i.e. tmp10 = z5 + (z12 * -1.082392200) - confirm against the helper.
tmp10 = Vector256_.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823
tmp12 = Vector256_.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131
// Each of these depends on the value assigned on the line before it; the order must be kept.
tmp6 = tmp12 - tmp7;
tmp5 = tmp11 - tmp6;
tmp4 = tmp10 - tmp5;
// Combine the even (tmp0..tmp3) and odd (tmp4..tmp7) parts into mirrored output rows 0/7, 1/6, 2/5 and 3/4.
block.V256_0 = tmp0 + tmp7;
block.V256_7 = tmp0 - tmp7;
block.V256_1 = tmp1 + tmp6;
block.V256_6 = tmp1 - tmp6;
block.V256_2 = tmp2 + tmp5;
block.V256_5 = tmp2 - tmp5;
block.V256_3 = tmp3 + tmp4;
block.V256_4 = tmp3 - tmp4;
}
}
}

14
src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs

@ -4,7 +4,7 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
@ -101,7 +101,7 @@ internal static partial class FloatingPointDCT
}
/// <summary>
/// Apply 2D floating point IDCT inplace.
/// Apply 2D floating point IDCT in place.
/// </summary>
/// <remarks>
/// Input block must be dequantized with quantization table
@ -110,9 +110,9 @@ internal static partial class FloatingPointDCT
/// <param name="block">Input block.</param>
public static void TransformIDCT(ref Block8x8F block)
{
if (Avx.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
IDCT8x8_Avx(ref block);
IDCT8x8_Vector256(ref block);
}
else
{
@ -121,7 +121,7 @@ internal static partial class FloatingPointDCT
}
/// <summary>
/// Apply 2D floating point IDCT inplace.
/// Apply 2D floating point FDCT in place.
/// </summary>
/// <remarks>
/// Input block must be quantized after this method with quantization
@ -130,9 +130,9 @@ internal static partial class FloatingPointDCT
/// <param name="block">Input block.</param>
public static void TransformFDCT(ref Block8x8F block)
{
if (Avx.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
FDCT8x8_Avx(ref block);
FDCT8x8_Vector256(ref block);
}
else
{

2
tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs

@ -271,7 +271,7 @@ public class Block8x8Tests : JpegFixture
Block8x8 block8x8 = Block8x8.Load(Create8x8ShortData());
block8x8.TransposeInplace();
block8x8.TransposeInPlace();
short[] actual = new short[64];
block8x8.CopyTo(actual);

2
tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs

@ -60,7 +60,7 @@ internal static partial class LibJpegTools
internal void MakeBlock(Block8x8 block, int y, int x)
{
block.TransposeInplace();
block.TransposeInPlace();
this.MakeBlock(block.ToArray(), y, x);
}

Loading…
Cancel
Save