diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index a0733b6607..372fff08cd 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -1012,9 +1012,9 @@ internal static partial class SimdUtils Unsafe.Add(ref destinationBase, i) = b; } } - else if (Sse2.IsSupported || AdvSimd.IsSupported) + else if (Vector128.IsHardwareAccelerated) { - // Sse, AdvSimd + // Sse, AdvSimd, etc. DebugVerifySpanInput(source, destination, Vector128.Count); nuint n = destination.Vector128Count(); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 7657379061..e99eecc42c 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -6,6 +6,7 @@ using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; namespace SixLabors.ImageSharp.Common.Helpers; @@ -270,8 +271,16 @@ internal static class Vector128Utilities return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right); } - ThrowUnreachableException(); - return default; + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right); + } + + Vector128 min = Vector128.Create((short)byte.MinValue); + Vector128 max = Vector128.Create((short)byte.MaxValue); + Vector128 lefClamped = Clamp(left, min, max).AsUInt16(); + Vector128 rightClamped = Clamp(right, min, max).AsUInt16(); + return Vector128.Narrow(lefClamped, rightClamped); } /// @@ -293,10 +302,30 @@ internal static class Vector128Utilities return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right); } - ThrowUnreachableException(); - return default; + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateSigned(left, right); + } + + Vector128 min = Vector128.Create((int)short.MinValue); + Vector128 max = Vector128.Create((int)short.MaxValue); + Vector128 lefClamped = Clamp(left, min, max); + Vector128 rightClamped = Clamp(right, min, max); + return Vector128.Narrow(lefClamped, rightClamped); } + /// Restricts a vector between a minimum and a maximum value. + /// + /// The type of the elements in the vector. + /// The vector to restrict. + /// The minimum value. + /// The maximum value. + /// The restricted . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Clamp(Vector128 value, Vector128 min, Vector128 max) + => Vector128.Min(Vector128.Max(value, min), max); + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index 63be76f00f..3921eccb7d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -64,6 +64,7 @@ internal partial struct Block8x8F ref Vector128 destBase = ref Unsafe.As>(ref dest); + // TODO: We can use the v128 utilities for this. for (nuint i = 0; i < 16; i += 2) { Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs similarity index 85% rename from src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs rename to src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs index 5954ad3251..899a883e4e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs @@ -5,7 +5,6 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; -// namespace SixLabors.ImageSharp.Formats.Jpeg.Components; internal partial struct Block8x8F @@ -13,28 +12,29 @@ internal partial struct Block8x8F /// /// Level shift by +maximum/2, clip to [0, maximum] /// + /// The maximum value to normalize to. public void NormalizeColorsInPlace(float maximum) { - var CMin4 = new Vector4(0F); - var CMax4 = new Vector4(maximum); - var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F)); - - this.V0L = Numerics.Clamp(this.V0L + COff4, CMin4, CMax4); - this.V0R = Numerics.Clamp(this.V0R + COff4, CMin4, CMax4); - this.V1L = Numerics.Clamp(this.V1L + COff4, CMin4, CMax4); - this.V1R = Numerics.Clamp(this.V1R + COff4, CMin4, CMax4); - this.V2L = Numerics.Clamp(this.V2L + COff4, CMin4, CMax4); - this.V2R = Numerics.Clamp(this.V2R + COff4, CMin4, CMax4); - this.V3L = Numerics.Clamp(this.V3L + COff4, CMin4, CMax4); - this.V3R = Numerics.Clamp(this.V3R + COff4, CMin4, CMax4); - this.V4L = Numerics.Clamp(this.V4L + COff4, CMin4, CMax4); - this.V4R = Numerics.Clamp(this.V4R + COff4, CMin4, CMax4); - this.V5L = Numerics.Clamp(this.V5L + COff4, CMin4, CMax4); - this.V5R = Numerics.Clamp(this.V5R + COff4, CMin4, CMax4); - this.V6L = Numerics.Clamp(this.V6L + COff4, CMin4, CMax4); - this.V6R = Numerics.Clamp(this.V6R + COff4, CMin4, CMax4); - this.V7L = Numerics.Clamp(this.V7L + COff4, CMin4, CMax4); - this.V7R = Numerics.Clamp(this.V7R + COff4, CMin4, CMax4); + Vector4 min = Vector4.Zero; + Vector4 max = new(maximum); + Vector4 off = new(MathF.Ceiling(maximum * 0.5F)); + + this.V0L = Vector4.Clamp(this.V0L + off, min, max); + this.V0R = Vector4.Clamp(this.V0R + off, min, max); + this.V1L = Vector4.Clamp(this.V1L + off, min, max); + this.V1R = Vector4.Clamp(this.V1R + off, min, max); + this.V2L = Vector4.Clamp(this.V2L + off, min, max); + this.V2R = Vector4.Clamp(this.V2R + off, min, max); + this.V3L = Vector4.Clamp(this.V3L + off, min, max); + this.V3R = Vector4.Clamp(this.V3R + off, min, max); + this.V4L = Vector4.Clamp(this.V4L + off, min, max); + this.V4R = Vector4.Clamp(this.V4R + off, min, max); + this.V5L = Vector4.Clamp(this.V5L + off, min, max); + this.V5R = Vector4.Clamp(this.V5R + off, min, max); + this.V6L = Vector4.Clamp(this.V6L + off, min, max); + this.V6R = Vector4.Clamp(this.V6R + off, min, max); + this.V7L = Vector4.Clamp(this.V7L + off, min, max); + this.V7R = Vector4.Clamp(this.V7R + off, min, max); } /// @@ -44,7 +44,7 @@ internal partial struct Block8x8F [MethodImpl(InliningOptions.ShortMethod)] public void NormalizeColorsAndRoundInPlaceVector256(float maximum) { - Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); + Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); Vector256 max = Vector256.Create(maximum); ref Vector256 row0 = ref Unsafe.As>(ref this.V0L); @@ -103,6 +103,7 @@ internal partial struct Block8x8F /// /// Fill the block from 'source' doing short -> float conversion. /// + /// The source block public void LoadFromInt16Scalar(ref Block8x8 source) { ref short selfRef = ref Unsafe.As(ref source); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 7aa1fb296b..2eecafc136 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -159,17 +159,18 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void MultiplyInPlace(float value) { - if (Avx.IsSupported) + // TODO: Vector512 + if (Vector256.IsHardwareAccelerated) { Vector256 valueVec = Vector256.Create(value); - this.V0 = Avx.Multiply(this.V0, valueVec); - this.V1 = Avx.Multiply(this.V1, valueVec); - this.V2 = Avx.Multiply(this.V2, valueVec); - this.V3 = Avx.Multiply(this.V3, valueVec); - this.V4 = Avx.Multiply(this.V4, valueVec); - this.V5 = Avx.Multiply(this.V5, valueVec); - this.V6 = Avx.Multiply(this.V6, valueVec); - this.V7 = Avx.Multiply(this.V7, valueVec); + this.V0 *= valueVec; + this.V1 *= valueVec; + this.V2 *= valueVec; + this.V3 *= valueVec; + this.V4 *= valueVec; + this.V5 *= valueVec; + this.V6 *= valueVec; + this.V7 *= valueVec; } else { @@ -200,16 +201,17 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public unsafe void MultiplyInPlace(ref Block8x8F other) { - if (Avx.IsSupported) + // TODO: Vector512 + if (Vector256.IsHardwareAccelerated) { - this.V0 = Avx.Multiply(this.V0, other.V0); - this.V1 = Avx.Multiply(this.V1, other.V1); - this.V2 = Avx.Multiply(this.V2, other.V2); - this.V3 = Avx.Multiply(this.V3, other.V3); - this.V4 = Avx.Multiply(this.V4, other.V4); - this.V5 = Avx.Multiply(this.V5, other.V5); - this.V6 = Avx.Multiply(this.V6, other.V6); - this.V7 = Avx.Multiply(this.V7, other.V7); + this.V0 *= other.V0; + this.V1 *= other.V1; + this.V2 *= other.V2; + this.V3 *= other.V3; + this.V4 *= other.V4; + this.V5 *= other.V5; + this.V6 *= other.V6; + this.V7 *= other.V7; } else { @@ -239,17 +241,18 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void AddInPlace(float value) { - if (Avx.IsSupported) + // TODO: Vector512 + if (Vector256.IsHardwareAccelerated) { Vector256 valueVec = Vector256.Create(value); - this.V0 = Avx.Add(this.V0, valueVec); - this.V1 = Avx.Add(this.V1, valueVec); - this.V2 = Avx.Add(this.V2, valueVec); - this.V3 = Avx.Add(this.V3, valueVec); - this.V4 = Avx.Add(this.V4, valueVec); - this.V5 = Avx.Add(this.V5, valueVec); - this.V6 = Avx.Add(this.V6, valueVec); - this.V7 = Avx.Add(this.V7, valueVec); + this.V0 += valueVec; + this.V1 += valueVec; + this.V2 += valueVec; + this.V3 += valueVec; + this.V4 += valueVec; + this.V5 += valueVec; + this.V6 += valueVec; + this.V7 += valueVec; } else { @@ -509,10 +512,10 @@ internal partial struct Block8x8F : IEquatable } /// - /// Transpose the block inplace. + /// Transpose the block in-place. /// [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInplace() + public void TransposeInPlace() { if (Avx.IsSupported) { @@ -520,15 +523,15 @@ internal partial struct Block8x8F : IEquatable } else { - this.TransposeInplace_Scalar(); + this.TransposeInPlace_Scalar(); } } /// - /// Scalar inplace transpose implementation for + /// Scalar in-place transpose implementation for /// [MethodImpl(InliningOptions.ShortMethod)] - private void TransposeInplace_Scalar() + private void TransposeInPlace_Scalar() { ref float elemRef = ref Unsafe.As(ref this); diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs index 7e102f696d..b11d834a8c 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs @@ -20,7 +20,7 @@ internal static partial class FloatingPointDCT FDCT8x8_1D_Avx(ref block); // Second pass - process rows - block.TransposeInplace(); + block.TransposeInPlace(); FDCT8x8_1D_Avx(ref block); // Applies 1D floating point FDCT inplace @@ -81,7 +81,7 @@ internal static partial class FloatingPointDCT IDCT8x8_1D_Avx(ref transposedBlock); // Second pass - process rows - transposedBlock.TransposeInplace(); + transposedBlock.TransposeInPlace(); IDCT8x8_1D_Avx(ref transposedBlock); // Applies 1D floating point FDCT inplace diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs index 0aca33b4c9..4c22307cfe 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs @@ -77,7 +77,7 @@ internal static partial class FloatingPointDCT // Spectral macroblocks are transposed before quantization // so we must transpose quantization table - quantTable.TransposeInplace(); + quantTable.TransposeInPlace(); } /// @@ -97,7 +97,7 @@ internal static partial class FloatingPointDCT // Spectral macroblocks are not transposed before quantization // Transpose is done after quantization at zig-zag stage // so we must transpose quantization table - quantTable.TransposeInplace(); + quantTable.TransposeInPlace(); } /// @@ -155,7 +155,7 @@ internal static partial class FloatingPointDCT IDCT8x4_Vector4(ref transposedBlock.V0R); // Second pass - process rows - transposedBlock.TransposeInplace(); + transposedBlock.TransposeInPlace(); IDCT8x4_Vector4(ref transposedBlock.V0L); IDCT8x4_Vector4(ref transposedBlock.V0R); @@ -225,7 +225,7 @@ internal static partial class FloatingPointDCT FDCT8x4_Vector4(ref block.V0R); // Second pass - process rows - block.TransposeInplace(); + block.TransposeInPlace(); FDCT8x4_Vector4(ref block.V0L); FDCT8x4_Vector4(ref block.V0R); diff --git a/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs index 98e3857973..b8234ff3e4 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs @@ -48,7 +48,7 @@ internal static class ScaledFloatingPointDCT // Spectral macroblocks are transposed before quantization // so we must transpose quantization table - quantTable.TransposeInplace(); + quantTable.TransposeInPlace(); } /// diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs index 07907f21d7..caca630bc2 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -14,7 +14,7 @@ public class Block8x8F_Transpose [Benchmark] public float TransposeInplace() { - this.source.TransposeInplace(); + this.source.TransposeInPlace(); return this.source[0]; } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 4d804f646e..7b73c0c522 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -130,7 +130,7 @@ public partial class Block8x8FTests : JpegFixture Block8x8F block8x8 = Block8x8F.Load(Create8x8FloatData()); - block8x8.TransposeInplace(); + block8x8.TransposeInPlace(); float[] actual = new float[64]; block8x8.ScaledCopyTo(actual); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index 5a1488c411..7b411a28fe 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -62,7 +62,7 @@ public static class DCTTests FloatingPointDCT.AdjustToIDCT(ref dequantMatrix); // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); srcBlock.MultiplyInPlace(ref dequantMatrix); // IDCT calculation @@ -95,7 +95,7 @@ public static class DCTTests FloatingPointDCT.AdjustToIDCT(ref dequantMatrix); // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); srcBlock.MultiplyInPlace(ref dequantMatrix); // IDCT calculation @@ -136,7 +136,7 @@ public static class DCTTests // testee // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); FloatingPointDCT.TransformIDCT(ref srcBlock); float[] actualDest = srcBlock.ToArray(); @@ -182,7 +182,7 @@ public static class DCTTests // testee // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); ScaledFloatingPointDCT.TransformIDCT_4x4(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue); Span expectedSpan = expectedDest.AsSpan(); @@ -243,7 +243,7 @@ public static class DCTTests // testee // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); ScaledFloatingPointDCT.TransformIDCT_2x2(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue); Span expectedSpan = expectedDest.AsSpan(); @@ -338,7 +338,7 @@ public static class DCTTests // Second transpose call is done by Quantize step // Do this manually here just to be complient to the reference implementation FloatingPointDCT.TransformFDCT(ref block); - block.TransposeInplace(); + block.TransposeInPlace(); // Part of the IDCT calculations is fused into the quantization step // We must multiply input block with adjusted no-quantization matrix