Browse Source

Clean up and prep for Vector512 multiply

pull/2918/head
James Jackson-South 1 year ago
parent
commit
29a56350ce
  1. 4
      src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
  2. 37
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  3. 1
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
  4. 45
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs
  5. 67
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
  6. 4
      src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
  7. 8
      src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs
  8. 2
      src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs
  9. 2
      tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
  10. 2
      tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
  11. 12
      tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs

4
src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

@ -1012,9 +1012,9 @@ internal static partial class SimdUtils
Unsafe.Add(ref destinationBase, i) = b; Unsafe.Add(ref destinationBase, i) = b;
} }
} }
else if (Sse2.IsSupported || AdvSimd.IsSupported) else if (Vector128.IsHardwareAccelerated)
{ {
// Sse, AdvSimd // Sse, AdvSimd, etc.
DebugVerifySpanInput(source, destination, Vector128<byte>.Count); DebugVerifySpanInput(source, destination, Vector128<byte>.Count);
nuint n = destination.Vector128Count<byte>(); nuint n = destination.Vector128Count<byte>();

37
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

@ -6,6 +6,7 @@ using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics; using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.Wasm;
using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Common.Helpers;
@ -270,8 +271,16 @@ internal static class Vector128Utilities
return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right); return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right);
} }
ThrowUnreachableException(); if (PackedSimd.IsSupported)
return default; {
return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right);
}
Vector128<short> min = Vector128.Create((short)byte.MinValue);
Vector128<short> max = Vector128.Create((short)byte.MaxValue);
Vector128<ushort> lefClamped = Clamp(left, min, max).AsUInt16();
Vector128<ushort> rightClamped = Clamp(right, min, max).AsUInt16();
return Vector128.Narrow(lefClamped, rightClamped);
} }
/// <summary> /// <summary>
@ -293,10 +302,30 @@ internal static class Vector128Utilities
return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right); return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right);
} }
ThrowUnreachableException(); if (PackedSimd.IsSupported)
return default; {
return PackedSimd.ConvertNarrowingSaturateSigned(left, right);
}
Vector128<int> min = Vector128.Create((int)short.MinValue);
Vector128<int> max = Vector128.Create((int)short.MaxValue);
Vector128<int> lefClamped = Clamp(left, min, max);
Vector128<int> rightClamped = Clamp(right, min, max);
return Vector128.Narrow(lefClamped, rightClamped);
} }
/// <summary>
/// Restricts a vector between a minimum and a maximum value.
/// </summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="value">The vector to restrict.</param>
/// <param name="min">The minimum value.</param>
/// <param name="max">The maximum value.</param>
/// <returns>The restricted <see cref="Vector128{T}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<T> Clamp<T>(Vector128<T> value, Vector128<T> min, Vector128<T> max)
=> Vector128.Min(Vector128.Max(value, min), max);
[DoesNotReturn] [DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException(); private static void ThrowUnreachableException() => throw new UnreachableException();
} }

1
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs

@ -64,6 +64,7 @@ internal partial struct Block8x8F
ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest); ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);
// TODO: We can use the v128 utilities for this.
for (nuint i = 0; i < 16; i += 2) for (nuint i = 0; i < 16; i += 2)
{ {
Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));

45
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs → src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs

@ -5,7 +5,6 @@ using System.Numerics;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics; using System.Runtime.Intrinsics;
// <auto-generated />
namespace SixLabors.ImageSharp.Formats.Jpeg.Components; namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal partial struct Block8x8F internal partial struct Block8x8F
@ -13,28 +12,29 @@ internal partial struct Block8x8F
/// <summary> /// <summary>
/// Level shift by +maximum/2, clip to [0, maximum] /// Level shift by +maximum/2, clip to [0, maximum]
/// </summary> /// </summary>
/// <param name="maximum">The maximum value to normalize to.</param>
public void NormalizeColorsInPlace(float maximum) public void NormalizeColorsInPlace(float maximum)
{ {
var CMin4 = new Vector4(0F); Vector4 min = Vector4.Zero;
var CMax4 = new Vector4(maximum); Vector4 max = new(maximum);
var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F)); Vector4 off = new(MathF.Ceiling(maximum * 0.5F));
this.V0L = Numerics.Clamp(this.V0L + COff4, CMin4, CMax4); this.V0L = Vector4.Clamp(this.V0L + off, min, max);
this.V0R = Numerics.Clamp(this.V0R + COff4, CMin4, CMax4); this.V0R = Vector4.Clamp(this.V0R + off, min, max);
this.V1L = Numerics.Clamp(this.V1L + COff4, CMin4, CMax4); this.V1L = Vector4.Clamp(this.V1L + off, min, max);
this.V1R = Numerics.Clamp(this.V1R + COff4, CMin4, CMax4); this.V1R = Vector4.Clamp(this.V1R + off, min, max);
this.V2L = Numerics.Clamp(this.V2L + COff4, CMin4, CMax4); this.V2L = Vector4.Clamp(this.V2L + off, min, max);
this.V2R = Numerics.Clamp(this.V2R + COff4, CMin4, CMax4); this.V2R = Vector4.Clamp(this.V2R + off, min, max);
this.V3L = Numerics.Clamp(this.V3L + COff4, CMin4, CMax4); this.V3L = Vector4.Clamp(this.V3L + off, min, max);
this.V3R = Numerics.Clamp(this.V3R + COff4, CMin4, CMax4); this.V3R = Vector4.Clamp(this.V3R + off, min, max);
this.V4L = Numerics.Clamp(this.V4L + COff4, CMin4, CMax4); this.V4L = Vector4.Clamp(this.V4L + off, min, max);
this.V4R = Numerics.Clamp(this.V4R + COff4, CMin4, CMax4); this.V4R = Vector4.Clamp(this.V4R + off, min, max);
this.V5L = Numerics.Clamp(this.V5L + COff4, CMin4, CMax4); this.V5L = Vector4.Clamp(this.V5L + off, min, max);
this.V5R = Numerics.Clamp(this.V5R + COff4, CMin4, CMax4); this.V5R = Vector4.Clamp(this.V5R + off, min, max);
this.V6L = Numerics.Clamp(this.V6L + COff4, CMin4, CMax4); this.V6L = Vector4.Clamp(this.V6L + off, min, max);
this.V6R = Numerics.Clamp(this.V6R + COff4, CMin4, CMax4); this.V6R = Vector4.Clamp(this.V6R + off, min, max);
this.V7L = Numerics.Clamp(this.V7L + COff4, CMin4, CMax4); this.V7L = Vector4.Clamp(this.V7L + off, min, max);
this.V7R = Numerics.Clamp(this.V7R + COff4, CMin4, CMax4); this.V7R = Vector4.Clamp(this.V7R + off, min, max);
} }
/// <summary> /// <summary>
@ -44,7 +44,7 @@ internal partial struct Block8x8F
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public void NormalizeColorsAndRoundInPlaceVector256(float maximum) public void NormalizeColorsAndRoundInPlaceVector256(float maximum)
{ {
Vector256<float> off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); Vector256<float> off = Vector256.Create(MathF.Ceiling(maximum * 0.5F));
Vector256<float> max = Vector256.Create(maximum); Vector256<float> max = Vector256.Create(maximum);
ref Vector256<float> row0 = ref Unsafe.As<Vector4, Vector256<float>>(ref this.V0L); ref Vector256<float> row0 = ref Unsafe.As<Vector4, Vector256<float>>(ref this.V0L);
@ -103,6 +103,7 @@ internal partial struct Block8x8F
/// <summary> /// <summary>
/// Fill the block from 'source' doing short -> float conversion. /// Fill the block from 'source' doing short -> float conversion.
/// </summary> /// </summary>
/// <param name="source">The source block</param>
public void LoadFromInt16Scalar(ref Block8x8 source) public void LoadFromInt16Scalar(ref Block8x8 source)
{ {
ref short selfRef = ref Unsafe.As<Block8x8, short>(ref source); ref short selfRef = ref Unsafe.As<Block8x8, short>(ref source);

67
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

@ -159,17 +159,18 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public void MultiplyInPlace(float value) public void MultiplyInPlace(float value)
{ {
if (Avx.IsSupported) // TODO: Vector512
if (Vector256.IsHardwareAccelerated)
{ {
Vector256<float> valueVec = Vector256.Create(value); Vector256<float> valueVec = Vector256.Create(value);
this.V0 = Avx.Multiply(this.V0, valueVec); this.V0 *= valueVec;
this.V1 = Avx.Multiply(this.V1, valueVec); this.V1 *= valueVec;
this.V2 = Avx.Multiply(this.V2, valueVec); this.V2 *= valueVec;
this.V3 = Avx.Multiply(this.V3, valueVec); this.V3 *= valueVec;
this.V4 = Avx.Multiply(this.V4, valueVec); this.V4 *= valueVec;
this.V5 = Avx.Multiply(this.V5, valueVec); this.V5 *= valueVec;
this.V6 = Avx.Multiply(this.V6, valueVec); this.V6 *= valueVec;
this.V7 = Avx.Multiply(this.V7, valueVec); this.V7 *= valueVec;
} }
else else
{ {
@ -200,16 +201,17 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public unsafe void MultiplyInPlace(ref Block8x8F other) public unsafe void MultiplyInPlace(ref Block8x8F other)
{ {
if (Avx.IsSupported) // TODO: Vector512
if (Vector256.IsHardwareAccelerated)
{ {
this.V0 = Avx.Multiply(this.V0, other.V0); this.V0 *= other.V0;
this.V1 = Avx.Multiply(this.V1, other.V1); this.V1 *= other.V1;
this.V2 = Avx.Multiply(this.V2, other.V2); this.V2 *= other.V2;
this.V3 = Avx.Multiply(this.V3, other.V3); this.V3 *= other.V3;
this.V4 = Avx.Multiply(this.V4, other.V4); this.V4 *= other.V4;
this.V5 = Avx.Multiply(this.V5, other.V5); this.V5 *= other.V5;
this.V6 = Avx.Multiply(this.V6, other.V6); this.V6 *= other.V6;
this.V7 = Avx.Multiply(this.V7, other.V7); this.V7 *= other.V7;
} }
else else
{ {
@ -239,17 +241,18 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public void AddInPlace(float value) public void AddInPlace(float value)
{ {
if (Avx.IsSupported) // TODO: Vector512
if (Vector256.IsHardwareAccelerated)
{ {
Vector256<float> valueVec = Vector256.Create(value); Vector256<float> valueVec = Vector256.Create(value);
this.V0 = Avx.Add(this.V0, valueVec); this.V0 += valueVec;
this.V1 = Avx.Add(this.V1, valueVec); this.V1 += valueVec;
this.V2 = Avx.Add(this.V2, valueVec); this.V2 += valueVec;
this.V3 = Avx.Add(this.V3, valueVec); this.V3 += valueVec;
this.V4 = Avx.Add(this.V4, valueVec); this.V4 += valueVec;
this.V5 = Avx.Add(this.V5, valueVec); this.V5 += valueVec;
this.V6 = Avx.Add(this.V6, valueVec); this.V6 += valueVec;
this.V7 = Avx.Add(this.V7, valueVec); this.V7 += valueVec;
} }
else else
{ {
@ -509,10 +512,10 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
} }
/// <summary> /// <summary>
/// Transpose the block inplace. /// Transpose the block in-place.
/// </summary> /// </summary>
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public void TransposeInplace() public void TransposeInPlace()
{ {
if (Avx.IsSupported) if (Avx.IsSupported)
{ {
@ -520,15 +523,15 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
} }
else else
{ {
this.TransposeInplace_Scalar(); this.TransposeInPlace_Scalar();
} }
} }
/// <summary> /// <summary>
/// Scalar inplace transpose implementation for <see cref="TransposeInplace"/> /// Scalar in-place transpose implementation for <see cref="TransposeInPlace"/>
/// </summary> /// </summary>
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
private void TransposeInplace_Scalar() private void TransposeInPlace_Scalar()
{ {
ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref this); ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref this);

4
src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs

@ -20,7 +20,7 @@ internal static partial class FloatingPointDCT
FDCT8x8_1D_Avx(ref block); FDCT8x8_1D_Avx(ref block);
// Second pass - process rows // Second pass - process rows
block.TransposeInplace(); block.TransposeInPlace();
FDCT8x8_1D_Avx(ref block); FDCT8x8_1D_Avx(ref block);
// Applies 1D floating point FDCT inplace // Applies 1D floating point FDCT inplace
@ -81,7 +81,7 @@ internal static partial class FloatingPointDCT
IDCT8x8_1D_Avx(ref transposedBlock); IDCT8x8_1D_Avx(ref transposedBlock);
// Second pass - process rows // Second pass - process rows
transposedBlock.TransposeInplace(); transposedBlock.TransposeInPlace();
IDCT8x8_1D_Avx(ref transposedBlock); IDCT8x8_1D_Avx(ref transposedBlock);
// Applies 1D floating point FDCT inplace // Applies 1D floating point FDCT inplace

8
src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs

@ -77,7 +77,7 @@ internal static partial class FloatingPointDCT
// Spectral macroblocks are transposed before quantization // Spectral macroblocks are transposed before quantization
// so we must transpose quantization table // so we must transpose quantization table
quantTable.TransposeInplace(); quantTable.TransposeInPlace();
} }
/// <summary> /// <summary>
@ -97,7 +97,7 @@ internal static partial class FloatingPointDCT
// Spectral macroblocks are not transposed before quantization // Spectral macroblocks are not transposed before quantization
// Transpose is done after quantization at zig-zag stage // Transpose is done after quantization at zig-zag stage
// so we must transpose quantization table // so we must transpose quantization table
quantTable.TransposeInplace(); quantTable.TransposeInPlace();
} }
/// <summary> /// <summary>
@ -155,7 +155,7 @@ internal static partial class FloatingPointDCT
IDCT8x4_Vector4(ref transposedBlock.V0R); IDCT8x4_Vector4(ref transposedBlock.V0R);
// Second pass - process rows // Second pass - process rows
transposedBlock.TransposeInplace(); transposedBlock.TransposeInPlace();
IDCT8x4_Vector4(ref transposedBlock.V0L); IDCT8x4_Vector4(ref transposedBlock.V0L);
IDCT8x4_Vector4(ref transposedBlock.V0R); IDCT8x4_Vector4(ref transposedBlock.V0R);
@ -225,7 +225,7 @@ internal static partial class FloatingPointDCT
FDCT8x4_Vector4(ref block.V0R); FDCT8x4_Vector4(ref block.V0R);
// Second pass - process rows // Second pass - process rows
block.TransposeInplace(); block.TransposeInPlace();
FDCT8x4_Vector4(ref block.V0L); FDCT8x4_Vector4(ref block.V0L);
FDCT8x4_Vector4(ref block.V0R); FDCT8x4_Vector4(ref block.V0R);

2
src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs

@ -48,7 +48,7 @@ internal static class ScaledFloatingPointDCT
// Spectral macroblocks are transposed before quantization // Spectral macroblocks are transposed before quantization
// so we must transpose quantization table // so we must transpose quantization table
quantTable.TransposeInplace(); quantTable.TransposeInPlace();
} }
/// <summary> /// <summary>

2
tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs

@ -14,7 +14,7 @@ public class Block8x8F_Transpose
[Benchmark] [Benchmark]
public float TransposeInplace() public float TransposeInplace()
{ {
this.source.TransposeInplace(); this.source.TransposeInPlace();
return this.source[0]; return this.source[0];
} }

2
tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs

@ -130,7 +130,7 @@ public partial class Block8x8FTests : JpegFixture
Block8x8F block8x8 = Block8x8F.Load(Create8x8FloatData()); Block8x8F block8x8 = Block8x8F.Load(Create8x8FloatData());
block8x8.TransposeInplace(); block8x8.TransposeInPlace();
float[] actual = new float[64]; float[] actual = new float[64];
block8x8.ScaledCopyTo(actual); block8x8.ScaledCopyTo(actual);

12
tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs

@ -62,7 +62,7 @@ public static class DCTTests
FloatingPointDCT.AdjustToIDCT(ref dequantMatrix); FloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
// IDCT implementation transforms blocks after transposition // IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace(); srcBlock.TransposeInPlace();
srcBlock.MultiplyInPlace(ref dequantMatrix); srcBlock.MultiplyInPlace(ref dequantMatrix);
// IDCT calculation // IDCT calculation
@ -95,7 +95,7 @@ public static class DCTTests
FloatingPointDCT.AdjustToIDCT(ref dequantMatrix); FloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
// IDCT implementation transforms blocks after transposition // IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace(); srcBlock.TransposeInPlace();
srcBlock.MultiplyInPlace(ref dequantMatrix); srcBlock.MultiplyInPlace(ref dequantMatrix);
// IDCT calculation // IDCT calculation
@ -136,7 +136,7 @@ public static class DCTTests
// testee // testee
// IDCT implementation transforms blocks after transposition // IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace(); srcBlock.TransposeInPlace();
FloatingPointDCT.TransformIDCT(ref srcBlock); FloatingPointDCT.TransformIDCT(ref srcBlock);
float[] actualDest = srcBlock.ToArray(); float[] actualDest = srcBlock.ToArray();
@ -182,7 +182,7 @@ public static class DCTTests
// testee // testee
// IDCT implementation transforms blocks after transposition // IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace(); srcBlock.TransposeInPlace();
ScaledFloatingPointDCT.TransformIDCT_4x4(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue); ScaledFloatingPointDCT.TransformIDCT_4x4(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue);
Span<float> expectedSpan = expectedDest.AsSpan(); Span<float> expectedSpan = expectedDest.AsSpan();
@ -243,7 +243,7 @@ public static class DCTTests
// testee // testee
// IDCT implementation transforms blocks after transposition // IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace(); srcBlock.TransposeInPlace();
ScaledFloatingPointDCT.TransformIDCT_2x2(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue); ScaledFloatingPointDCT.TransformIDCT_2x2(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue);
Span<float> expectedSpan = expectedDest.AsSpan(); Span<float> expectedSpan = expectedDest.AsSpan();
@ -338,7 +338,7 @@ public static class DCTTests
// Second transpose call is done by Quantize step // Second transpose call is done by Quantize step
// Do this manually here just to be compliant with the reference implementation // Do this manually here just to be compliant with the reference implementation
FloatingPointDCT.TransformFDCT(ref block); FloatingPointDCT.TransformFDCT(ref block);
block.TransposeInplace(); block.TransposeInPlace();
// Part of the IDCT calculations is fused into the quantization step // Part of the IDCT calculations is fused into the quantization step
// We must multiply input block with adjusted no-quantization matrix // We must multiply input block with adjusted no-quantization matrix

Loading…
Cancel
Save