From 69caa490e04e1bb29858dfc941175e8bc5391047 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 6 May 2025 20:21:40 +1000 Subject: [PATCH 01/12] Add Vector128 rounding --- .../Jpeg/Components/Block8x8F.Generated.cs | 231 ++++++++++-------- .../Jpeg/Components/Block8x8F.Generated.tt | 103 -------- .../Formats/Jpeg/Components/Block8x8F.cs | 37 ++- src/ImageSharp/ImageSharp.csproj | 18 -- .../Formats/Jpg/Block8x8FTests.cs | 54 +++- 5 files changed, 199 insertions(+), 244 deletions(-) delete mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs index 93bb7be36..5954ad325 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs @@ -3,13 +3,14 @@ using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; // namespace SixLabors.ImageSharp.Formats.Jpeg.Components; internal partial struct Block8x8F { - /// + /// /// Level shift by +maximum/2, clip to [0, maximum] /// public void NormalizeColorsInPlace(float maximum) @@ -37,38 +38,66 @@ internal partial struct Block8x8F } /// - /// AVX2-only variant for executing and in one step. + /// version of and . /// + /// The maximum value to normalize to. [MethodImpl(InliningOptions.ShortMethod)] - public void NormalizeColorsAndRoundInPlaceVector8(float maximum) + public void NormalizeColorsAndRoundInPlaceVector256(float maximum) { - var off = new Vector(MathF.Ceiling(maximum * 0.5F)); - var max = new Vector(maximum); - - ref Vector row0 = ref Unsafe.As>(ref this.V0L); - row0 = NormalizeAndRound(row0, off, max); - - ref Vector row1 = ref Unsafe.As>(ref this.V1L); - row1 = NormalizeAndRound(row1, off, max); - - ref Vector row2 = ref Unsafe.As>(ref this.V2L); - row2 = NormalizeAndRound(row2, off, max); - - ref Vector row3 = ref Unsafe.As>(ref this.V3L); - row3 = NormalizeAndRound(row3, off, max); - - ref Vector row4 = ref Unsafe.As>(ref this.V4L); - row4 = NormalizeAndRound(row4, off, max); - - ref Vector row5 = ref Unsafe.As>(ref this.V5L); - row5 = NormalizeAndRound(row5, off, max); - - ref Vector row6 = ref Unsafe.As>(ref this.V6L); - row6 = NormalizeAndRound(row6, off, max); - - ref Vector row7 = ref Unsafe.As>(ref this.V7L); - row7 = NormalizeAndRound(row7, off, max); - + Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); + Vector256 max = Vector256.Create(maximum); + + ref Vector256 row0 = ref Unsafe.As>(ref this.V0L); + row0 = NormalizeAndRoundVector256(row0, off, max); + + ref Vector256 row1 = ref Unsafe.As>(ref this.V1L); + row1 = NormalizeAndRoundVector256(row1, off, max); + + ref Vector256 row2 = ref Unsafe.As>(ref this.V2L); + row2 = NormalizeAndRoundVector256(row2, off, max); + + ref Vector256 row3 = ref Unsafe.As>(ref this.V3L); + row3 = NormalizeAndRoundVector256(row3, off, max); + + ref Vector256 row4 = ref Unsafe.As>(ref this.V4L); + row4 = NormalizeAndRoundVector256(row4, off, max); + + ref Vector256 row5 = ref Unsafe.As>(ref this.V5L); + row5 = NormalizeAndRoundVector256(row5, off, max); + + ref Vector256 row6 = ref Unsafe.As>(ref this.V6L); + row6 = NormalizeAndRoundVector256(row6, off, max); + + ref Vector256 row7 = ref Unsafe.As>(ref this.V7L); + row7 = NormalizeAndRoundVector256(row7, off, max); + } + + /// + /// version of and . + /// + /// The maximum value to normalize to. 
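The normalize-and-round step performed by both new methods is the same per row: level shift by +maximum/2, clamp to [0, maximum], then round each lane to the nearest integer. Below is a minimal standalone sketch of that math for a single Vector128<float> row; the method name is illustrative only, and the 2^23 bit-trick fallback (valid here because the lanes are already clamped to a small range) stands in for the Vector128 rounding helper the patch actually calls.

    // Illustrative sketch, not part of the patch.
    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.Arm;
    using System.Runtime.Intrinsics.X86;

    static Vector128<float> NormalizeAndRoundRow(Vector128<float> row, float maximum)
    {
        Vector128<float> off = Vector128.Create(MathF.Ceiling(maximum * 0.5F));
        Vector128<float> max = Vector128.Create(maximum);

        // Level shift, then clamp to [0, maximum].
        row += off;
        row = Vector128.Max(row, Vector128<float>.Zero);
        row = Vector128.Min(row, max);

        // Round each lane to the nearest integer.
        if (Sse41.IsSupported)
        {
            return Sse41.RoundToNearestInteger(row);
        }

        if (AdvSimd.IsSupported)
        {
            return AdvSimd.RoundToNearest(row);
        }

        // Portable fallback: adding and subtracting 2^23 (copy-signed) forces
        // round-to-nearest for lanes that are well inside +/-2^23, which holds
        // after the clamp above.
        Vector128<float> sign = row & Vector128.Create(-0.0F);
        Vector128<float> magic = sign | Vector128.Create(8388608F);
        return (row + magic) - magic;
    }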
+ [MethodImpl(InliningOptions.ShortMethod)] + public void NormalizeColorsAndRoundInPlaceVector128(float maximum) + { + Vector128 off = Vector128.Create(MathF.Ceiling(maximum * 0.5F)); + Vector128 max = Vector128.Create(maximum); + + this.V0L = NormalizeAndRoundVector128(this.V0L.AsVector128(), off, max).AsVector4(); + this.V0R = NormalizeAndRoundVector128(this.V0R.AsVector128(), off, max).AsVector4(); + this.V1L = NormalizeAndRoundVector128(this.V1L.AsVector128(), off, max).AsVector4(); + this.V1R = NormalizeAndRoundVector128(this.V1R.AsVector128(), off, max).AsVector4(); + this.V2L = NormalizeAndRoundVector128(this.V2L.AsVector128(), off, max).AsVector4(); + this.V2R = NormalizeAndRoundVector128(this.V2R.AsVector128(), off, max).AsVector4(); + this.V3L = NormalizeAndRoundVector128(this.V3L.AsVector128(), off, max).AsVector4(); + this.V3R = NormalizeAndRoundVector128(this.V3R.AsVector128(), off, max).AsVector4(); + this.V4L = NormalizeAndRoundVector128(this.V4L.AsVector128(), off, max).AsVector4(); + this.V4R = NormalizeAndRoundVector128(this.V4R.AsVector128(), off, max).AsVector4(); + this.V5L = NormalizeAndRoundVector128(this.V5L.AsVector128(), off, max).AsVector4(); + this.V5R = NormalizeAndRoundVector128(this.V5R.AsVector128(), off, max).AsVector4(); + this.V6L = NormalizeAndRoundVector128(this.V6L.AsVector128(), off, max).AsVector4(); + this.V6R = NormalizeAndRoundVector128(this.V6R.AsVector128(), off, max).AsVector4(); + this.V7L = NormalizeAndRoundVector128(this.V7L.AsVector128(), off, max).AsVector4(); + this.V7R = NormalizeAndRoundVector128(this.V7R.AsVector128(), off, max).AsVector4(); } /// @@ -78,76 +107,76 @@ internal partial struct Block8x8F { ref short selfRef = ref Unsafe.As(ref source); - this.V0L.X = Unsafe.Add(ref selfRef, 0); - this.V0L.Y = Unsafe.Add(ref selfRef, 1); - this.V0L.Z = Unsafe.Add(ref selfRef, 2); - this.V0L.W = Unsafe.Add(ref selfRef, 3); - this.V0R.X = Unsafe.Add(ref selfRef, 4); - this.V0R.Y = Unsafe.Add(ref selfRef, 5); - this.V0R.Z = Unsafe.Add(ref selfRef, 6); - this.V0R.W = Unsafe.Add(ref selfRef, 7); - - this.V1L.X = Unsafe.Add(ref selfRef, 8); - this.V1L.Y = Unsafe.Add(ref selfRef, 9); - this.V1L.Z = Unsafe.Add(ref selfRef, 10); - this.V1L.W = Unsafe.Add(ref selfRef, 11); - this.V1R.X = Unsafe.Add(ref selfRef, 12); - this.V1R.Y = Unsafe.Add(ref selfRef, 13); - this.V1R.Z = Unsafe.Add(ref selfRef, 14); - this.V1R.W = Unsafe.Add(ref selfRef, 15); - - this.V2L.X = Unsafe.Add(ref selfRef, 16); - this.V2L.Y = Unsafe.Add(ref selfRef, 17); - this.V2L.Z = Unsafe.Add(ref selfRef, 18); - this.V2L.W = Unsafe.Add(ref selfRef, 19); - this.V2R.X = Unsafe.Add(ref selfRef, 20); - this.V2R.Y = Unsafe.Add(ref selfRef, 21); - this.V2R.Z = Unsafe.Add(ref selfRef, 22); - this.V2R.W = Unsafe.Add(ref selfRef, 23); - - this.V3L.X = Unsafe.Add(ref selfRef, 24); - this.V3L.Y = Unsafe.Add(ref selfRef, 25); - this.V3L.Z = Unsafe.Add(ref selfRef, 26); - this.V3L.W = Unsafe.Add(ref selfRef, 27); - this.V3R.X = Unsafe.Add(ref selfRef, 28); - this.V3R.Y = Unsafe.Add(ref selfRef, 29); - this.V3R.Z = Unsafe.Add(ref selfRef, 30); - this.V3R.W = Unsafe.Add(ref selfRef, 31); - - this.V4L.X = Unsafe.Add(ref selfRef, 32); - this.V4L.Y = Unsafe.Add(ref selfRef, 33); - this.V4L.Z = Unsafe.Add(ref selfRef, 34); - this.V4L.W = Unsafe.Add(ref selfRef, 35); - this.V4R.X = Unsafe.Add(ref selfRef, 36); - this.V4R.Y = Unsafe.Add(ref selfRef, 37); - this.V4R.Z = Unsafe.Add(ref selfRef, 38); - this.V4R.W = Unsafe.Add(ref selfRef, 39); - - this.V5L.X = Unsafe.Add(ref selfRef, 40); - this.V5L.Y = 
Unsafe.Add(ref selfRef, 41); - this.V5L.Z = Unsafe.Add(ref selfRef, 42); - this.V5L.W = Unsafe.Add(ref selfRef, 43); - this.V5R.X = Unsafe.Add(ref selfRef, 44); - this.V5R.Y = Unsafe.Add(ref selfRef, 45); - this.V5R.Z = Unsafe.Add(ref selfRef, 46); - this.V5R.W = Unsafe.Add(ref selfRef, 47); - - this.V6L.X = Unsafe.Add(ref selfRef, 48); - this.V6L.Y = Unsafe.Add(ref selfRef, 49); - this.V6L.Z = Unsafe.Add(ref selfRef, 50); - this.V6L.W = Unsafe.Add(ref selfRef, 51); - this.V6R.X = Unsafe.Add(ref selfRef, 52); - this.V6R.Y = Unsafe.Add(ref selfRef, 53); - this.V6R.Z = Unsafe.Add(ref selfRef, 54); - this.V6R.W = Unsafe.Add(ref selfRef, 55); - - this.V7L.X = Unsafe.Add(ref selfRef, 56); - this.V7L.Y = Unsafe.Add(ref selfRef, 57); - this.V7L.Z = Unsafe.Add(ref selfRef, 58); - this.V7L.W = Unsafe.Add(ref selfRef, 59); - this.V7R.X = Unsafe.Add(ref selfRef, 60); - this.V7R.Y = Unsafe.Add(ref selfRef, 61); - this.V7R.Z = Unsafe.Add(ref selfRef, 62); - this.V7R.W = Unsafe.Add(ref selfRef, 63); + this.V0L.X = Unsafe.Add(ref selfRef, 0); + this.V0L.Y = Unsafe.Add(ref selfRef, 1); + this.V0L.Z = Unsafe.Add(ref selfRef, 2); + this.V0L.W = Unsafe.Add(ref selfRef, 3); + this.V0R.X = Unsafe.Add(ref selfRef, 4); + this.V0R.Y = Unsafe.Add(ref selfRef, 5); + this.V0R.Z = Unsafe.Add(ref selfRef, 6); + this.V0R.W = Unsafe.Add(ref selfRef, 7); + + this.V1L.X = Unsafe.Add(ref selfRef, 8); + this.V1L.Y = Unsafe.Add(ref selfRef, 9); + this.V1L.Z = Unsafe.Add(ref selfRef, 10); + this.V1L.W = Unsafe.Add(ref selfRef, 11); + this.V1R.X = Unsafe.Add(ref selfRef, 12); + this.V1R.Y = Unsafe.Add(ref selfRef, 13); + this.V1R.Z = Unsafe.Add(ref selfRef, 14); + this.V1R.W = Unsafe.Add(ref selfRef, 15); + + this.V2L.X = Unsafe.Add(ref selfRef, 16); + this.V2L.Y = Unsafe.Add(ref selfRef, 17); + this.V2L.Z = Unsafe.Add(ref selfRef, 18); + this.V2L.W = Unsafe.Add(ref selfRef, 19); + this.V2R.X = Unsafe.Add(ref selfRef, 20); + this.V2R.Y = Unsafe.Add(ref selfRef, 21); + this.V2R.Z = Unsafe.Add(ref selfRef, 22); + this.V2R.W = Unsafe.Add(ref selfRef, 23); + + this.V3L.X = Unsafe.Add(ref selfRef, 24); + this.V3L.Y = Unsafe.Add(ref selfRef, 25); + this.V3L.Z = Unsafe.Add(ref selfRef, 26); + this.V3L.W = Unsafe.Add(ref selfRef, 27); + this.V3R.X = Unsafe.Add(ref selfRef, 28); + this.V3R.Y = Unsafe.Add(ref selfRef, 29); + this.V3R.Z = Unsafe.Add(ref selfRef, 30); + this.V3R.W = Unsafe.Add(ref selfRef, 31); + + this.V4L.X = Unsafe.Add(ref selfRef, 32); + this.V4L.Y = Unsafe.Add(ref selfRef, 33); + this.V4L.Z = Unsafe.Add(ref selfRef, 34); + this.V4L.W = Unsafe.Add(ref selfRef, 35); + this.V4R.X = Unsafe.Add(ref selfRef, 36); + this.V4R.Y = Unsafe.Add(ref selfRef, 37); + this.V4R.Z = Unsafe.Add(ref selfRef, 38); + this.V4R.W = Unsafe.Add(ref selfRef, 39); + + this.V5L.X = Unsafe.Add(ref selfRef, 40); + this.V5L.Y = Unsafe.Add(ref selfRef, 41); + this.V5L.Z = Unsafe.Add(ref selfRef, 42); + this.V5L.W = Unsafe.Add(ref selfRef, 43); + this.V5R.X = Unsafe.Add(ref selfRef, 44); + this.V5R.Y = Unsafe.Add(ref selfRef, 45); + this.V5R.Z = Unsafe.Add(ref selfRef, 46); + this.V5R.W = Unsafe.Add(ref selfRef, 47); + + this.V6L.X = Unsafe.Add(ref selfRef, 48); + this.V6L.Y = Unsafe.Add(ref selfRef, 49); + this.V6L.Z = Unsafe.Add(ref selfRef, 50); + this.V6L.W = Unsafe.Add(ref selfRef, 51); + this.V6R.X = Unsafe.Add(ref selfRef, 52); + this.V6R.Y = Unsafe.Add(ref selfRef, 53); + this.V6R.Z = Unsafe.Add(ref selfRef, 54); + this.V6R.W = Unsafe.Add(ref selfRef, 55); + + this.V7L.X = Unsafe.Add(ref selfRef, 56); + this.V7L.Y = Unsafe.Add(ref selfRef, 
57); + this.V7L.Z = Unsafe.Add(ref selfRef, 58); + this.V7L.W = Unsafe.Add(ref selfRef, 59); + this.V7R.X = Unsafe.Add(ref selfRef, 60); + this.V7R.Y = Unsafe.Add(ref selfRef, 61); + this.V7R.Z = Unsafe.Add(ref selfRef, 62); + this.V7R.W = Unsafe.Add(ref selfRef, 63); } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt deleted file mode 100644 index 19b795c23..000000000 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt +++ /dev/null @@ -1,103 +0,0 @@ -<# -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. -#> -<#@ template debug="false" hostspecific="false" language="C#" #> -<#@ assembly name="System.Core" #> -<#@ import namespace="System.Linq" #> -<#@ import namespace="System.Text" #> -<#@ import namespace="System.Collections.Generic" #> -<#@ output extension=".cs" #> -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. - -using System.Numerics; -using System.Runtime.CompilerServices; - -// -<# -char[] coordz = {'X', 'Y', 'Z', 'W'}; -#> -namespace SixLabors.ImageSharp.Formats.Jpeg.Components; - -internal partial struct Block8x8F -{ - /// - /// Level shift by +maximum/2, clip to [0, maximum] - /// - public void NormalizeColorsInPlace(float maximum) - { - var CMin4 = new Vector4(0F); - var CMax4 = new Vector4(maximum); - var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F)); - - <# - - PushIndent(" "); - - for (int i = 0; i < 8; i++) - { - for (int j = 0; j < 2; j++) - { - char side = j == 0 ? 'L' : 'R'; - Write($"this.V{i}{side} = Numerics.Clamp(this.V{i}{side} + COff4, CMin4, CMax4);\r\n"); - } - } - PopIndent(); - #> - } - - /// - /// AVX2-only variant for executing and in one step. - /// - [MethodImpl(InliningOptions.ShortMethod)] - public void NormalizeColorsAndRoundInPlaceVector8(float maximum) - { - var off = new Vector(MathF.Ceiling(maximum * 0.5F)); - var max = new Vector(maximum); - <# - - for (int i = 0; i < 8; i++) - { - #> - - ref Vector row<#=i#> = ref Unsafe.As>(ref this.V<#=i#>L); - row<#=i#> = NormalizeAndRound(row<#=i#>, off, max); - <# - } - #> - - } - - /// - /// Fill the block from 'source' doing short -> float conversion. - /// - public void LoadFromInt16Scalar(ref Block8x8 source) - { - ref short selfRef = ref Unsafe.As(ref source); - - <# - PushIndent(" "); - for (int j = 0; j < 8; j++) - { - for (int i = 0; i < 8; i++) - { - char destCoord = coordz[i % 4]; - char destSide = (i / 4) % 2 == 0 ? 'L' : 'R'; - - if(j > 0 && i == 0){ - WriteLine(""); - } - - char srcCoord = coordz[j % 4]; - char srcSide = (j / 4) % 2 == 0 ? 
'L' : 'R'; - - var expression = $"this.V{j}{destSide}.{destCoord} = Unsafe.Add(ref selfRef, {j*8+i});\r\n"; - Write(expression); - - } - } - PopIndent(); - #> - } -} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 018df5f9f..7aa1fb296 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -8,6 +8,8 @@ using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using System.Text; using SixLabors.ImageSharp.Common.Helpers; +using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities; +using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Jpeg.Components; @@ -332,22 +334,13 @@ internal partial struct Block8x8F : IEquatable /// The maximum value. public void NormalizeColorsAndRoundInPlace(float maximum) { - if (SimdUtils.HasVector8) + if (Vector256.IsHardwareAccelerated) { - this.NormalizeColorsAndRoundInPlaceVector8(maximum); - } - else - { - this.NormalizeColorsInPlace(maximum); - this.RoundInPlace(); + this.NormalizeColorsAndRoundInPlaceVector256(maximum); } - } - - public void DE_NormalizeColors(float maximum) - { - if (SimdUtils.HasVector8) + else if (Vector128.IsHardwareAccelerated) { - this.NormalizeColorsAndRoundInPlaceVector8(maximum); + this.NormalizeColorsAndRoundInPlaceVector128(maximum); } else { @@ -590,4 +583,22 @@ internal partial struct Block8x8F : IEquatable row = Vector.Min(row, max); return row.FastRound(); } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector256 NormalizeAndRoundVector256(Vector256 row, Vector256 off, Vector256 max) + { + row += off; + row = Vector256.Max(row, Vector256.Zero); + row = Vector256.Min(row, max); + return Vector256_.RoundToNearestInteger(row); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 NormalizeAndRoundVector128(Vector128 row, Vector128 off, Vector128 max) + { + row += off; + row = Vector128.Max(row, Vector128.Zero); + row = Vector128.Min(row, max); + return Vector128_.RoundToNearestInteger(row); + } } diff --git a/src/ImageSharp/ImageSharp.csproj b/src/ImageSharp/ImageSharp.csproj index 0d36340bf..fde3e94e9 100644 --- a/src/ImageSharp/ImageSharp.csproj +++ b/src/ImageSharp/ImageSharp.csproj @@ -56,16 +56,6 @@ True ImageMetadataExtensions.tt - - True - True - Block8x8F.Generated.tt - - - True - True - Block8x8F.Generated.tt - True True @@ -158,14 +148,6 @@ ImageMetadataExtensions.cs TextTemplatingFileGenerator - - TextTemplatingFileGenerator - Block8x8F.Generated.cs - - - TextTemplatingFileGenerator - Block8x8F.Generated.cs - TextTemplatingFileGenerator Abgr32.PixelOperations.Generated.cs diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index cde9e776b..4d804f646 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -3,6 +3,7 @@ // Uncomment this to turn unit tests into benchmarks: // #define BENCHMARKING +using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils; using SixLabors.ImageSharp.Tests.TestUtilities; @@ -24,11 +25,22 @@ public partial class Block8x8FTests : JpegFixture { } - private bool SkipOnNonAvx2Runner() + private bool SkipOnNonVector256Runner() { - if (!SimdUtils.HasVector8) + if 
(!Vector256.IsHardwareAccelerated) { - this.Output.WriteLine("AVX2 not supported, skipping!"); + this.Output.WriteLine("Vector256 not supported, skipping!"); + return true; + } + + return false; + } + + private bool SkipOnNonVector128Runner() + { + if (!Vector128.IsHardwareAccelerated) + { + this.Output.WriteLine("Vector128 not supported, skipping!"); return true; } @@ -172,9 +184,33 @@ public partial class Block8x8FTests : JpegFixture [Theory] [InlineData(1)] [InlineData(2)] - public void NormalizeColorsAndRoundAvx2(int seed) + public void NormalizeColorsAndRoundVector256(int seed) + { + if (this.SkipOnNonVector256Runner()) + { + return; + } + + Block8x8F source = CreateRandomFloatBlock(-200, 200, seed); + + Block8x8F expected = source; + expected.NormalizeColorsInPlace(255); + expected.RoundInPlace(); + + Block8x8F actual = source; + actual.NormalizeColorsAndRoundInPlaceVector256(255); + + this.Output.WriteLine(expected.ToString()); + this.Output.WriteLine(actual.ToString()); + this.CompareBlocks(expected, actual, 0); + } + + [Theory] + [InlineData(1)] + [InlineData(2)] + public void NormalizeColorsAndRoundVector128(int seed) { - if (this.SkipOnNonAvx2Runner()) + if (this.SkipOnNonVector128Runner()) { return; } @@ -186,7 +222,7 @@ public partial class Block8x8FTests : JpegFixture expected.RoundInPlace(); Block8x8F actual = source; - actual.NormalizeColorsAndRoundInPlaceVector8(255); + actual.NormalizeColorsAndRoundInPlaceVector128(255); this.Output.WriteLine(expected.ToString()); this.Output.WriteLine(actual.ToString()); @@ -206,7 +242,7 @@ public partial class Block8x8FTests : JpegFixture Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed); // Quantization code is used only in jpeg where it's guaranteed that - // qunatization valus are greater than 1 + // quantization values are greater than 1 // Quantize method supports negative numbers by very small numbers can cause troubles Block8x8F quant = CreateRandomFloatBlock(1, 2000, qtSeed); @@ -345,7 +381,7 @@ public partial class Block8x8FTests : JpegFixture [Fact] public void LoadFromUInt16Scalar() { - if (this.SkipOnNonAvx2Runner()) + if (this.SkipOnNonVector256Runner()) { return; } @@ -366,7 +402,7 @@ public partial class Block8x8FTests : JpegFixture [Fact] public void LoadFromUInt16ExtendedAvx2() { - if (this.SkipOnNonAvx2Runner()) + if (this.SkipOnNonVector256Runner()) { return; } From 29a56350ce6b4e0e0ac623fb58ecfeaa5513ad68 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 7 May 2025 10:20:48 +1000 Subject: [PATCH 02/12] Clean up and prep for Vector512 multiply --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 4 +- .../Common/Helpers/Vector128Utilities.cs | 37 ++++++++-- .../Jpeg/Components/Block8x8F.Intrinsic.cs | 1 + ...ck8x8F.Generated.cs => Block8x8F.Round.cs} | 45 +++++++------ .../Formats/Jpeg/Components/Block8x8F.cs | 67 ++++++++++--------- .../Components/FloatingPointDCT.Intrinsic.cs | 4 +- .../Jpeg/Components/FloatingPointDCT.cs | 8 +-- .../Jpeg/Components/ScaledFloatingPointDCT.cs | 2 +- .../BlockOperations/Block8x8F_Transpose.cs | 2 +- .../Formats/Jpg/Block8x8FTests.cs | 2 +- .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs | 12 ++-- 11 files changed, 109 insertions(+), 75 deletions(-) rename src/ImageSharp/Formats/Jpeg/Components/{Block8x8F.Generated.cs => Block8x8F.Round.cs} (85%) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index a0733b660..372fff08c 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs 
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -1012,9 +1012,9 @@ internal static partial class SimdUtils Unsafe.Add(ref destinationBase, i) = b; } } - else if (Sse2.IsSupported || AdvSimd.IsSupported) + else if (Vector128.IsHardwareAccelerated) { - // Sse, AdvSimd + // Sse, AdvSimd, etc. DebugVerifySpanInput(source, destination, Vector128.Count); nuint n = destination.Vector128Count(); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 765737906..e99eecc42 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -6,6 +6,7 @@ using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; namespace SixLabors.ImageSharp.Common.Helpers; @@ -270,8 +271,16 @@ internal static class Vector128Utilities return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right); } - ThrowUnreachableException(); - return default; + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right); + } + + Vector128 min = Vector128.Create((short)byte.MinValue); + Vector128 max = Vector128.Create((short)byte.MaxValue); + Vector128 lefClamped = Clamp(left, min, max).AsUInt16(); + Vector128 rightClamped = Clamp(right, min, max).AsUInt16(); + return Vector128.Narrow(lefClamped, rightClamped); } /// @@ -293,10 +302,30 @@ internal static class Vector128Utilities return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right); } - ThrowUnreachableException(); - return default; + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateSigned(left, right); + } + + Vector128 min = Vector128.Create((int)short.MinValue); + Vector128 max = Vector128.Create((int)short.MaxValue); + Vector128 lefClamped = Clamp(left, min, max); + Vector128 rightClamped = Clamp(right, min, max); + return Vector128.Narrow(lefClamped, rightClamped); } + /// Restricts a vector between a minimum and a maximum value. + /// + /// The type of the elements in the vector. + /// The vector to restrict. + /// The minimum value. + /// The maximum value. + /// The restricted . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Clamp(Vector128 value, Vector128 min, Vector128 max) + => Vector128.Min(Vector128.Max(value, min), max); + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index 63be76f00..3921eccb7 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -64,6 +64,7 @@ internal partial struct Block8x8F ref Vector128 destBase = ref Unsafe.As>(ref dest); + // TODO: We can use the v128 utilities for this. 
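What that TODO points at, concretely: the SSE-specific loop below could be expressed with the cross-platform helpers this change set already uses elsewhere, Vector128_.ConvertToInt32RoundToEven and Vector128_.PackSignedSaturate. The loop that follows is a sketch only, reusing the aBase/bBase/destBase references declared above, and is not part of the patch.

    // Sketch only (not part of the patch): the same multiply, round-to-int,
    // and saturating narrow steps written against the portable Vector128 surface.
    for (nuint i = 0; i < 16; i += 2)
    {
        Vector128<int> left = Vector128_.ConvertToInt32RoundToEven(
            Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0));
        Vector128<int> right = Vector128_.ConvertToInt32RoundToEven(
            Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1));

        Unsafe.Add(ref destBase, i / 2) = Vector128_.PackSignedSaturate(left, right);
    }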
for (nuint i = 0; i < 16; i += 2) { Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs similarity index 85% rename from src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs rename to src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs index 5954ad325..899a883e4 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs @@ -5,7 +5,6 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; -// namespace SixLabors.ImageSharp.Formats.Jpeg.Components; internal partial struct Block8x8F @@ -13,28 +12,29 @@ internal partial struct Block8x8F /// /// Level shift by +maximum/2, clip to [0, maximum] /// + /// The maximum value to normalize to. public void NormalizeColorsInPlace(float maximum) { - var CMin4 = new Vector4(0F); - var CMax4 = new Vector4(maximum); - var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F)); - - this.V0L = Numerics.Clamp(this.V0L + COff4, CMin4, CMax4); - this.V0R = Numerics.Clamp(this.V0R + COff4, CMin4, CMax4); - this.V1L = Numerics.Clamp(this.V1L + COff4, CMin4, CMax4); - this.V1R = Numerics.Clamp(this.V1R + COff4, CMin4, CMax4); - this.V2L = Numerics.Clamp(this.V2L + COff4, CMin4, CMax4); - this.V2R = Numerics.Clamp(this.V2R + COff4, CMin4, CMax4); - this.V3L = Numerics.Clamp(this.V3L + COff4, CMin4, CMax4); - this.V3R = Numerics.Clamp(this.V3R + COff4, CMin4, CMax4); - this.V4L = Numerics.Clamp(this.V4L + COff4, CMin4, CMax4); - this.V4R = Numerics.Clamp(this.V4R + COff4, CMin4, CMax4); - this.V5L = Numerics.Clamp(this.V5L + COff4, CMin4, CMax4); - this.V5R = Numerics.Clamp(this.V5R + COff4, CMin4, CMax4); - this.V6L = Numerics.Clamp(this.V6L + COff4, CMin4, CMax4); - this.V6R = Numerics.Clamp(this.V6R + COff4, CMin4, CMax4); - this.V7L = Numerics.Clamp(this.V7L + COff4, CMin4, CMax4); - this.V7R = Numerics.Clamp(this.V7R + COff4, CMin4, CMax4); + Vector4 min = Vector4.Zero; + Vector4 max = new(maximum); + Vector4 off = new(MathF.Ceiling(maximum * 0.5F)); + + this.V0L = Vector4.Clamp(this.V0L + off, min, max); + this.V0R = Vector4.Clamp(this.V0R + off, min, max); + this.V1L = Vector4.Clamp(this.V1L + off, min, max); + this.V1R = Vector4.Clamp(this.V1R + off, min, max); + this.V2L = Vector4.Clamp(this.V2L + off, min, max); + this.V2R = Vector4.Clamp(this.V2R + off, min, max); + this.V3L = Vector4.Clamp(this.V3L + off, min, max); + this.V3R = Vector4.Clamp(this.V3R + off, min, max); + this.V4L = Vector4.Clamp(this.V4L + off, min, max); + this.V4R = Vector4.Clamp(this.V4R + off, min, max); + this.V5L = Vector4.Clamp(this.V5L + off, min, max); + this.V5R = Vector4.Clamp(this.V5R + off, min, max); + this.V6L = Vector4.Clamp(this.V6L + off, min, max); + this.V6R = Vector4.Clamp(this.V6R + off, min, max); + this.V7L = Vector4.Clamp(this.V7L + off, min, max); + this.V7R = Vector4.Clamp(this.V7R + off, min, max); } /// @@ -44,7 +44,7 @@ internal partial struct Block8x8F [MethodImpl(InliningOptions.ShortMethod)] public void NormalizeColorsAndRoundInPlaceVector256(float maximum) { - Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); + Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); Vector256 max = Vector256.Create(maximum); ref Vector256 row0 = ref Unsafe.As>(ref this.V0L); @@ -103,6 +103,7 @@ internal partial struct 
Block8x8F /// /// Fill the block from 'source' doing short -> float conversion. /// + /// The source block public void LoadFromInt16Scalar(ref Block8x8 source) { ref short selfRef = ref Unsafe.As(ref source); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 7aa1fb296..2eecafc13 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -159,17 +159,18 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void MultiplyInPlace(float value) { - if (Avx.IsSupported) + // TODO: Vector512 + if (Vector256.IsHardwareAccelerated) { Vector256 valueVec = Vector256.Create(value); - this.V0 = Avx.Multiply(this.V0, valueVec); - this.V1 = Avx.Multiply(this.V1, valueVec); - this.V2 = Avx.Multiply(this.V2, valueVec); - this.V3 = Avx.Multiply(this.V3, valueVec); - this.V4 = Avx.Multiply(this.V4, valueVec); - this.V5 = Avx.Multiply(this.V5, valueVec); - this.V6 = Avx.Multiply(this.V6, valueVec); - this.V7 = Avx.Multiply(this.V7, valueVec); + this.V0 *= valueVec; + this.V1 *= valueVec; + this.V2 *= valueVec; + this.V3 *= valueVec; + this.V4 *= valueVec; + this.V5 *= valueVec; + this.V6 *= valueVec; + this.V7 *= valueVec; } else { @@ -200,16 +201,17 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public unsafe void MultiplyInPlace(ref Block8x8F other) { - if (Avx.IsSupported) + // TODO: Vector512 + if (Vector256.IsHardwareAccelerated) { - this.V0 = Avx.Multiply(this.V0, other.V0); - this.V1 = Avx.Multiply(this.V1, other.V1); - this.V2 = Avx.Multiply(this.V2, other.V2); - this.V3 = Avx.Multiply(this.V3, other.V3); - this.V4 = Avx.Multiply(this.V4, other.V4); - this.V5 = Avx.Multiply(this.V5, other.V5); - this.V6 = Avx.Multiply(this.V6, other.V6); - this.V7 = Avx.Multiply(this.V7, other.V7); + this.V0 *= other.V0; + this.V1 *= other.V1; + this.V2 *= other.V2; + this.V3 *= other.V3; + this.V4 *= other.V4; + this.V5 *= other.V5; + this.V6 *= other.V6; + this.V7 *= other.V7; } else { @@ -239,17 +241,18 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void AddInPlace(float value) { - if (Avx.IsSupported) + // TODO: Vector512 + if (Vector256.IsHardwareAccelerated) { Vector256 valueVec = Vector256.Create(value); - this.V0 = Avx.Add(this.V0, valueVec); - this.V1 = Avx.Add(this.V1, valueVec); - this.V2 = Avx.Add(this.V2, valueVec); - this.V3 = Avx.Add(this.V3, valueVec); - this.V4 = Avx.Add(this.V4, valueVec); - this.V5 = Avx.Add(this.V5, valueVec); - this.V6 = Avx.Add(this.V6, valueVec); - this.V7 = Avx.Add(this.V7, valueVec); + this.V0 += valueVec; + this.V1 += valueVec; + this.V2 += valueVec; + this.V3 += valueVec; + this.V4 += valueVec; + this.V5 += valueVec; + this.V6 += valueVec; + this.V7 += valueVec; } else { @@ -509,10 +512,10 @@ internal partial struct Block8x8F : IEquatable } /// - /// Transpose the block inplace. + /// Transpose the block in-place. 
/// [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInplace() + public void TransposeInPlace() { if (Avx.IsSupported) { @@ -520,15 +523,15 @@ internal partial struct Block8x8F : IEquatable } else { - this.TransposeInplace_Scalar(); + this.TransposeInPlace_Scalar(); } } /// - /// Scalar inplace transpose implementation for + /// Scalar in-place transpose implementation for /// [MethodImpl(InliningOptions.ShortMethod)] - private void TransposeInplace_Scalar() + private void TransposeInPlace_Scalar() { ref float elemRef = ref Unsafe.As(ref this); diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs index 7e102f696..b11d834a8 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs @@ -20,7 +20,7 @@ internal static partial class FloatingPointDCT FDCT8x8_1D_Avx(ref block); // Second pass - process rows - block.TransposeInplace(); + block.TransposeInPlace(); FDCT8x8_1D_Avx(ref block); // Applies 1D floating point FDCT inplace @@ -81,7 +81,7 @@ internal static partial class FloatingPointDCT IDCT8x8_1D_Avx(ref transposedBlock); // Second pass - process rows - transposedBlock.TransposeInplace(); + transposedBlock.TransposeInPlace(); IDCT8x8_1D_Avx(ref transposedBlock); // Applies 1D floating point FDCT inplace diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs index 0aca33b4c..4c22307cf 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs @@ -77,7 +77,7 @@ internal static partial class FloatingPointDCT // Spectral macroblocks are transposed before quantization // so we must transpose quantization table - quantTable.TransposeInplace(); + quantTable.TransposeInPlace(); } /// @@ -97,7 +97,7 @@ internal static partial class FloatingPointDCT // Spectral macroblocks are not transposed before quantization // Transpose is done after quantization at zig-zag stage // so we must transpose quantization table - quantTable.TransposeInplace(); + quantTable.TransposeInPlace(); } /// @@ -155,7 +155,7 @@ internal static partial class FloatingPointDCT IDCT8x4_Vector4(ref transposedBlock.V0R); // Second pass - process rows - transposedBlock.TransposeInplace(); + transposedBlock.TransposeInPlace(); IDCT8x4_Vector4(ref transposedBlock.V0L); IDCT8x4_Vector4(ref transposedBlock.V0R); @@ -225,7 +225,7 @@ internal static partial class FloatingPointDCT FDCT8x4_Vector4(ref block.V0R); // Second pass - process rows - block.TransposeInplace(); + block.TransposeInPlace(); FDCT8x4_Vector4(ref block.V0L); FDCT8x4_Vector4(ref block.V0R); diff --git a/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs index 98e385797..b8234ff3e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs @@ -48,7 +48,7 @@ internal static class ScaledFloatingPointDCT // Spectral macroblocks are transposed before quantization // so we must transpose quantization table - quantTable.TransposeInplace(); + quantTable.TransposeInPlace(); } /// diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs 
index 07907f21d..caca630bc 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -14,7 +14,7 @@ public class Block8x8F_Transpose [Benchmark] public float TransposeInplace() { - this.source.TransposeInplace(); + this.source.TransposeInPlace(); return this.source[0]; } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 4d804f646..7b73c0c52 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -130,7 +130,7 @@ public partial class Block8x8FTests : JpegFixture Block8x8F block8x8 = Block8x8F.Load(Create8x8FloatData()); - block8x8.TransposeInplace(); + block8x8.TransposeInPlace(); float[] actual = new float[64]; block8x8.ScaledCopyTo(actual); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index 5a1488c41..7b411a28f 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -62,7 +62,7 @@ public static class DCTTests FloatingPointDCT.AdjustToIDCT(ref dequantMatrix); // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); srcBlock.MultiplyInPlace(ref dequantMatrix); // IDCT calculation @@ -95,7 +95,7 @@ public static class DCTTests FloatingPointDCT.AdjustToIDCT(ref dequantMatrix); // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); srcBlock.MultiplyInPlace(ref dequantMatrix); // IDCT calculation @@ -136,7 +136,7 @@ public static class DCTTests // testee // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); FloatingPointDCT.TransformIDCT(ref srcBlock); float[] actualDest = srcBlock.ToArray(); @@ -182,7 +182,7 @@ public static class DCTTests // testee // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); ScaledFloatingPointDCT.TransformIDCT_4x4(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue); Span expectedSpan = expectedDest.AsSpan(); @@ -243,7 +243,7 @@ public static class DCTTests // testee // IDCT implementation tranforms blocks after transposition - srcBlock.TransposeInplace(); + srcBlock.TransposeInPlace(); ScaledFloatingPointDCT.TransformIDCT_2x2(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue); Span expectedSpan = expectedDest.AsSpan(); @@ -338,7 +338,7 @@ public static class DCTTests // Second transpose call is done by Quantize step // Do this manually here just to be complient to the reference implementation FloatingPointDCT.TransformFDCT(ref block); - block.TransposeInplace(); + block.TransposeInPlace(); // Part of the IDCT calculations is fused into the quantization step // We must multiply input block with adjusted no-quantization matrix From 5125a0480fe2f1249cc578da1336c38359b6430a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 7 May 2025 13:38:19 +1000 Subject: [PATCH 03/12] Rename utils, organize BlockF8x8 --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 194 +++++++-------- .../Common/Helpers/Vector128Utilities.cs | 8 +- .../Common/Helpers/Vector256Utilities.cs | 16 +- .../Common/Helpers/Vector512Utilities.cs | 30 ++- .../Jpeg/Components/Block8x8F.Intrinsic.cs | 145 
------------ .../Jpeg/Components/Block8x8F.Round.cs | 183 --------------- .../Jpeg/Components/Block8x8F.Vector128.cs | 66 ++++++ .../Jpeg/Components/Block8x8F.Vector256.cs | 191 +++++++++++++++ .../Formats/Jpeg/Components/Block8x8F.cs | 221 ++++++++++-------- .../JpegColorConverter.GrayScaleVector128.cs | 4 +- .../JpegColorConverter.GrayScaleVector256.cs | 4 +- .../JpegColorConverter.GrayScaleVector512.cs | 4 +- .../JpegColorConverter.YCbCrVector128.cs | 4 +- .../JpegColorConverter.YCbCrVector256.cs | 4 +- .../JpegColorConverter.YCbCrVector512.cs | 4 +- .../JpegColorConverter.YccKVector128.cs | 4 +- .../JpegColorConverter.YccKVector256.cs | 4 +- .../JpegColorConverter.YccKVector512.cs | 2 +- .../Components/FloatingPointDCT.Intrinsic.cs | 64 ++--- src/ImageSharp/Formats/Webp/AlphaDecoder.cs | 6 +- .../Codecs/Jpeg/DecodeJpeg.cs | 11 +- .../Config.HwIntrinsics.cs | 33 +++ 22 files changed, 624 insertions(+), 578 deletions(-) delete mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs delete mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 372fff08c..449dc37d0 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -66,9 +66,9 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleFloat) || - (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleFloat) || - (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleFloat)) + if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) || + (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) || + (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat)) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -112,9 +112,9 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleByte) || - (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleByte) || - (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte)) + if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) || + (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) || + (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte)) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -158,7 +158,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign) { int remainder = source.Length % (Vector128.Count * 3); @@ -190,7 +190,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && 
Vector128_.SupportsShiftByte) { int remainder = source.Length % (Vector128.Count * 3); @@ -223,7 +223,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) { int remainder = source.Length & ((Vector128.Count * 4) - 1); // bit-hack for modulo @@ -249,7 +249,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleFloat) + if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) { ref Vector512 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -263,21 +263,21 @@ internal static partial class SimdUtils ref Vector512 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector512 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector512Utilities.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector512_.Shuffle(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector512Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), control); } } } - else if (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleFloat) + else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) { ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -291,21 +291,21 @@ internal static partial class SimdUtils ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector256Utilities.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector256_.Shuffle(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector256Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), control); } } } - else if 
(Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleFloat) + else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat) { ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -319,17 +319,17 @@ internal static partial class SimdUtils ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector128 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector128Utilities.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector128_.Shuffle(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector128Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), control); } } } @@ -341,7 +341,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleByte) + if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) { Span temp = stackalloc byte[Vector512.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -359,21 +359,21 @@ internal static partial class SimdUtils ref Vector512 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector512 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector512Utilities.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector512_.Shuffle(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector512Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); } } } - else if (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleByte) + else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) { Span temp = stackalloc byte[Vector256.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -391,21 +391,21 @@ internal static partial class SimdUtils ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector256Utilities.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, 
(nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector256_.Shuffle(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector256Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); } } } - else if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte) + else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte) { Span temp = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -423,17 +423,17 @@ internal static partial class SimdUtils ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector128 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector128Utilities.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector128_.Shuffle(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector128Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); } } } @@ -445,11 +445,11 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); - Vector128 maskE = Vector128Utilities.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); + Vector128 maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref bytes, control); @@ -467,28 +467,28 @@ internal static partial class SimdUtils Vector128 v0 = vs; Vector128 v1 = Unsafe.Add(ref vs, (nuint)1); Vector128 v2 = Unsafe.Add(ref vs, (nuint)2); - Vector128 v3 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); + Vector128 v3 = Vector128_.ShiftRightBytesInVector(v2, 4); - v2 = Vector128Utilities.AlignRight(v2, v1, 8); - v1 = Vector128Utilities.AlignRight(v1, v0, 12); + v2 = Vector128_.AlignRight(v2, v1, 8); + v1 = Vector128_.AlignRight(v1, v0, 12); - v0 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, maskPad4Nx16), mask); - v1 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, maskPad4Nx16), mask); - v2 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, maskPad4Nx16), mask); - v3 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, maskPad4Nx16), mask); + 
v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16), mask); + v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16), mask); + v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16), mask); + v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16), mask); - v0 = Vector128Utilities.Shuffle(v0, maskE); - v1 = Vector128Utilities.Shuffle(v1, maskSlice4Nx16); - v2 = Vector128Utilities.Shuffle(v2, maskE); - v3 = Vector128Utilities.Shuffle(v3, maskSlice4Nx16); + v0 = Vector128_.Shuffle(v0, maskE); + v1 = Vector128_.Shuffle(v1, maskSlice4Nx16); + v2 = Vector128_.Shuffle(v2, maskE); + v3 = Vector128_.Shuffle(v3, maskSlice4Nx16); - v0 = Vector128Utilities.AlignRight(v1, v0, 4); - v3 = Vector128Utilities.AlignRight(v3, v2, 12); + v0 = Vector128_.AlignRight(v1, v0, 4); + v3 = Vector128_.AlignRight(v3, v2, 12); - v1 = Vector128Utilities.ShiftLeftBytesInVector(v1, 4); - v2 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); + v1 = Vector128_.ShiftLeftBytesInVector(v1, 4); + v2 = Vector128_.ShiftRightBytesInVector(v2, 4); - v1 = Vector128Utilities.AlignRight(v2, v1, 8); + v1 = Vector128_.AlignRight(v2, v1, 8); ref Vector128 vd = ref Unsafe.Add(ref destinationBase, i); @@ -505,7 +505,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 fill = Vector128.Create(0xff000000ff000000ul).AsByte(); @@ -527,17 +527,17 @@ internal static partial class SimdUtils ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i); Vector128 v1 = Unsafe.Add(ref v0, 1); Vector128 v2 = Unsafe.Add(ref v0, 2); - Vector128 v3 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); + Vector128 v3 = Vector128_.ShiftRightBytesInVector(v2, 4); - v2 = Vector128Utilities.AlignRight(v2, v1, 8); - v1 = Vector128Utilities.AlignRight(v1, v0, 12); + v2 = Vector128_.AlignRight(v2, v1, 8); + v1 = Vector128_.AlignRight(v1, v0, 12); ref Vector128 vd = ref Unsafe.Add(ref destinationBase, j); - vd = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 1) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 2) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 3) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, maskPad4Nx16) | fill, mask); + vd = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 1) = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 2) = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 3) = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16) | fill, mask); } } } @@ -548,10 +548,10 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) { Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); - Vector128 maskE = Vector128Utilities.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); + Vector128 maskE = 
Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); Span temp = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -574,18 +574,18 @@ internal static partial class SimdUtils Vector128 v2 = Unsafe.Add(ref vs, 2); Vector128 v3 = Unsafe.Add(ref vs, 3); - v0 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, mask), maskE); - v1 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, mask), maskSlice4Nx16); - v2 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, mask), maskE); - v3 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, mask), maskSlice4Nx16); + v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, mask), maskE); + v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, mask), maskSlice4Nx16); + v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, mask), maskE); + v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, mask), maskSlice4Nx16); - v0 = Vector128Utilities.AlignRight(v1, v0, 4); - v3 = Vector128Utilities.AlignRight(v3, v2, 12); + v0 = Vector128_.AlignRight(v1, v0, 4); + v3 = Vector128_.AlignRight(v3, v2, 12); - v1 = Vector128Utilities.ShiftLeftBytesInVector(v1, 4); - v2 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); + v1 = Vector128_.ShiftLeftBytesInVector(v1, 4); + v2 = Vector128_.ShiftRightBytesInVector(v2, 4); - v1 = Vector128Utilities.AlignRight(v2, v1, 8); + v1 = Vector128_.AlignRight(v2, v1, 8); ref Vector128 vd = ref Unsafe.Add(ref destinationBase, j); @@ -965,10 +965,10 @@ internal static partial class SimdUtils Vector512 f2 = scale * Unsafe.Add(ref s, 2); Vector512 f3 = scale * Unsafe.Add(ref s, 3); - Vector512 w0 = Vector512Utilities.ConvertToInt32RoundToEven(f0); - Vector512 w1 = Vector512Utilities.ConvertToInt32RoundToEven(f1); - Vector512 w2 = Vector512Utilities.ConvertToInt32RoundToEven(f2); - Vector512 w3 = Vector512Utilities.ConvertToInt32RoundToEven(f3); + Vector512 w0 = Vector512_.ConvertToInt32RoundToEven(f0); + Vector512 w1 = Vector512_.ConvertToInt32RoundToEven(f1); + Vector512 w2 = Vector512_.ConvertToInt32RoundToEven(f2); + Vector512 w3 = Vector512_.ConvertToInt32RoundToEven(f3); Vector512 u0 = Avx512BW.PackSignedSaturate(w0, w1); Vector512 u1 = Avx512BW.PackSignedSaturate(w2, w3); @@ -999,10 +999,10 @@ internal static partial class SimdUtils Vector256 f2 = scale * Unsafe.Add(ref s, 2); Vector256 f3 = scale * Unsafe.Add(ref s, 3); - Vector256 w0 = Vector256Utilities.ConvertToInt32RoundToEven(f0); - Vector256 w1 = Vector256Utilities.ConvertToInt32RoundToEven(f1); - Vector256 w2 = Vector256Utilities.ConvertToInt32RoundToEven(f2); - Vector256 w3 = Vector256Utilities.ConvertToInt32RoundToEven(f3); + Vector256 w0 = Vector256_.ConvertToInt32RoundToEven(f0); + Vector256 w1 = Vector256_.ConvertToInt32RoundToEven(f1); + Vector256 w2 = Vector256_.ConvertToInt32RoundToEven(f2); + Vector256 w3 = Vector256_.ConvertToInt32RoundToEven(f3); Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); Vector256 u1 = Avx2.PackSignedSaturate(w2, w3); @@ -1033,15 +1033,15 @@ internal static partial class SimdUtils Vector128 f2 = scale * Unsafe.Add(ref s, 2); Vector128 f3 = scale * Unsafe.Add(ref s, 3); - Vector128 w0 = Vector128Utilities.ConvertToInt32RoundToEven(f0); - Vector128 w1 = Vector128Utilities.ConvertToInt32RoundToEven(f1); - Vector128 w2 = Vector128Utilities.ConvertToInt32RoundToEven(f2); - Vector128 w3 = Vector128Utilities.ConvertToInt32RoundToEven(f3); + Vector128 w0 = Vector128_.ConvertToInt32RoundToEven(f0); + Vector128 w1 = Vector128_.ConvertToInt32RoundToEven(f1); + Vector128 w2 = Vector128_.ConvertToInt32RoundToEven(f2); + 
Vector128 w3 = Vector128_.ConvertToInt32RoundToEven(f3); - Vector128 u0 = Vector128Utilities.PackSignedSaturate(w0, w1); - Vector128 u1 = Vector128Utilities.PackSignedSaturate(w2, w3); + Vector128 u0 = Vector128_.PackSignedSaturate(w0, w1); + Vector128 u1 = Vector128_.PackSignedSaturate(w2, w3); - Unsafe.Add(ref destinationBase, i) = Vector128Utilities.PackUnsignedSaturate(u0, u1); + Unsafe.Add(ref destinationBase, i) = Vector128_.PackUnsignedSaturate(u0, u1); } } } diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index e99eecc42..85b09b351 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -19,7 +19,9 @@ namespace SixLabors.ImageSharp.Common.Helpers; /// /// Should only be used if the intrinsics are available. /// -internal static class Vector128Utilities +#pragma warning disable SA1649 // File name should match first type name +internal static class Vector128_ +#pragma warning restore SA1649 // File name should match first type name { /// /// Gets a value indicating whether shuffle operations are supported. @@ -314,8 +316,8 @@ internal static class Vector128Utilities return Vector128.Narrow(lefClamped, rightClamped); } - /// Restricts a vector between a minimum and a maximum value. + /// + /// Restricts a vector between a minimum and a maximum value. /// /// The type of the elements in the vector. /// The vector to restrict. diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 4c12cb272..893b6240d 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -17,7 +17,9 @@ namespace SixLabors.ImageSharp.Common.Helpers; /// /// Should only be used if the intrinsics are available. /// -internal static class Vector256Utilities +#pragma warning disable SA1649 // File name should match first type name +internal static class Vector256_ +#pragma warning restore SA1649 // File name should match first type name { /// /// Gets a value indicating whether shuffle byte operations are supported. @@ -152,6 +154,18 @@ internal static class Vector256Utilities return va + (vm0 * vm1); } + /// + /// Restricts a vector between a minimum and a maximum value. + /// + /// The type of the elements in the vector. + /// The vector to restrict. + /// The minimum value. + /// The maximum value. + /// The restricted . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 Clamp(Vector256 value, Vector256 min, Vector256 max) + => Vector256.Min(Vector256.Max(value, min), max); + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 40e8ac344..3c773bc52 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -17,7 +17,9 @@ namespace SixLabors.ImageSharp.Common.Helpers; /// /// Should only be used if the intrinsics are available. /// -internal static class Vector512Utilities +#pragma warning disable SA1649 // File name should match first type name +internal static class Vector512_ +#pragma warning restore SA1649 // File name should match first type name { /// /// Gets a value indicating whether shuffle float operations are supported. 
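The Clamp helper introduced above is the usual lane-wise min-of-max composition. A quick illustration (input values invented for the example):

using System.Runtime.Intrinsics;

// Clamp(value, min, max) == Min(Max(value, min), max), applied per lane.
Vector256<float> value = Vector256.Create(-12F, 0F, 63.5F, 255F, 300F, 17F, -1F, 128F);
Vector256<float> clamped = Vector256.Min(Vector256.Max(value, Vector256<float>.Zero), Vector256.Create(255F));
// clamped == <0, 0, 63.5, 255, 255, 17, 0, 128>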
@@ -126,6 +128,13 @@ internal static class Vector512Utilities return Avx512F.RoundScale(vector, 0b0000_1000); } + if (Avx.IsSupported) + { + Vector256 lower = Avx.RoundToNearestInteger(vector.GetLower()); + Vector256 upper = Avx.RoundToNearestInteger(vector.GetUpper()); + return Vector512.Create(lower, upper); + } + Vector512 sign = vector & Vector512.Create(-0F); Vector512 val_2p23_f32 = sign | Vector512.Create(8388608F); @@ -152,9 +161,28 @@ internal static class Vector512Utilities return Avx512F.FusedMultiplyAdd(vm0, vm1, va); } + if (Fma.IsSupported) + { + Vector256 lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower()); + Vector256 upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper()); + return Vector512.Create(lower, upper); + } + return va + (vm0 * vm1); } + /// + /// Restricts a vector between a minimum and a maximum value. + /// + /// The type of the elements in the vector. + /// The vector to restrict. + /// The minimum value. + /// The maximum value. + /// The restricted . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector512 Clamp(Vector512 value, Vector512 min, Vector512 max) + => Vector512.Min(Vector512.Max(value, min), max); + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs deleted file mode 100644 index 3921eccb7..000000000 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. - -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; - -namespace SixLabors.ImageSharp.Formats.Jpeg.Components; - -internal partial struct Block8x8F -{ - /// - /// A number of rows of 8 scalar coefficients each in - /// - public const int RowCount = 8; - - [FieldOffset(0)] - public Vector256 V0; - [FieldOffset(32)] - public Vector256 V1; - [FieldOffset(64)] - public Vector256 V2; - [FieldOffset(96)] - public Vector256 V3; - [FieldOffset(128)] - public Vector256 V4; - [FieldOffset(160)] - public Vector256 V5; - [FieldOffset(192)] - public Vector256 V6; - [FieldOffset(224)] - public Vector256 V7; - - private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) - { - DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); - - ref Vector256 aBase = ref a.V0; - ref Vector256 bBase = ref b.V0; - - ref Vector256 destRef = ref dest.V01; - Vector256 multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); - - for (nuint i = 0; i < 8; i += 2) - { - Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); - - Vector256 row = Avx2.PackSignedSaturate(row0, row1); - row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16(); - - Unsafe.Add(ref destRef, i / 2) = row; - } - } - - private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) - { - DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!"); - - ref Vector128 aBase = ref Unsafe.As>(ref a); - ref Vector128 
bBase = ref Unsafe.As>(ref b); - - ref Vector128 destBase = ref Unsafe.As>(ref dest); - - // TODO: We can use the v128 utilities for this. - for (nuint i = 0; i < 16; i += 2) - { - Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); - - Vector128 row = Sse2.PackSignedSaturate(left, right); - Unsafe.Add(ref destBase, i / 2) = row; - } - } - - private void TransposeInplace_Avx() - { - // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 - Vector256 r0 = Avx.InsertVector128( - this.V0, - Unsafe.As>(ref this.V4L), - 1); - - Vector256 r1 = Avx.InsertVector128( - this.V1, - Unsafe.As>(ref this.V5L), - 1); - - Vector256 r2 = Avx.InsertVector128( - this.V2, - Unsafe.As>(ref this.V6L), - 1); - - Vector256 r3 = Avx.InsertVector128( - this.V3, - Unsafe.As>(ref this.V7L), - 1); - - Vector256 r4 = Avx.InsertVector128( - Unsafe.As>(ref this.V0R).ToVector256(), - Unsafe.As>(ref this.V4R), - 1); - - Vector256 r5 = Avx.InsertVector128( - Unsafe.As>(ref this.V1R).ToVector256(), - Unsafe.As>(ref this.V5R), - 1); - - Vector256 r6 = Avx.InsertVector128( - Unsafe.As>(ref this.V2R).ToVector256(), - Unsafe.As>(ref this.V6R), - 1); - - Vector256 r7 = Avx.InsertVector128( - Unsafe.As>(ref this.V3R).ToVector256(), - Unsafe.As>(ref this.V7R), - 1); - - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackLow(r2, r3); - Vector256 v = Avx.Shuffle(t0, t2, 0x4E); - this.V0 = Avx.Blend(t0, v, 0xCC); - this.V1 = Avx.Blend(t2, v, 0x33); - - Vector256 t4 = Avx.UnpackLow(r4, r5); - Vector256 t6 = Avx.UnpackLow(r6, r7); - v = Avx.Shuffle(t4, t6, 0x4E); - this.V4 = Avx.Blend(t4, v, 0xCC); - this.V5 = Avx.Blend(t6, v, 0x33); - - Vector256 t1 = Avx.UnpackHigh(r0, r1); - Vector256 t3 = Avx.UnpackHigh(r2, r3); - v = Avx.Shuffle(t1, t3, 0x4E); - this.V2 = Avx.Blend(t1, v, 0xCC); - this.V3 = Avx.Blend(t3, v, 0x33); - - Vector256 t5 = Avx.UnpackHigh(r4, r5); - Vector256 t7 = Avx.UnpackHigh(r6, r7); - v = Avx.Shuffle(t5, t7, 0x4E); - this.V6 = Avx.Blend(t5, v, 0xCC); - this.V7 = Avx.Blend(t7, v, 0x33); - } -} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs deleted file mode 100644 index 899a883e4..000000000 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. - -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics; - -namespace SixLabors.ImageSharp.Formats.Jpeg.Components; - -internal partial struct Block8x8F -{ - /// - /// Level shift by +maximum/2, clip to [0, maximum] - /// - /// The maximum value to normalize to. 
- public void NormalizeColorsInPlace(float maximum) - { - Vector4 min = Vector4.Zero; - Vector4 max = new(maximum); - Vector4 off = new(MathF.Ceiling(maximum * 0.5F)); - - this.V0L = Vector4.Clamp(this.V0L + off, min, max); - this.V0R = Vector4.Clamp(this.V0R + off, min, max); - this.V1L = Vector4.Clamp(this.V1L + off, min, max); - this.V1R = Vector4.Clamp(this.V1R + off, min, max); - this.V2L = Vector4.Clamp(this.V2L + off, min, max); - this.V2R = Vector4.Clamp(this.V2R + off, min, max); - this.V3L = Vector4.Clamp(this.V3L + off, min, max); - this.V3R = Vector4.Clamp(this.V3R + off, min, max); - this.V4L = Vector4.Clamp(this.V4L + off, min, max); - this.V4R = Vector4.Clamp(this.V4R + off, min, max); - this.V5L = Vector4.Clamp(this.V5L + off, min, max); - this.V5R = Vector4.Clamp(this.V5R + off, min, max); - this.V6L = Vector4.Clamp(this.V6L + off, min, max); - this.V6R = Vector4.Clamp(this.V6R + off, min, max); - this.V7L = Vector4.Clamp(this.V7L + off, min, max); - this.V7R = Vector4.Clamp(this.V7R + off, min, max); - } - - /// - /// version of and . - /// - /// The maximum value to normalize to. - [MethodImpl(InliningOptions.ShortMethod)] - public void NormalizeColorsAndRoundInPlaceVector256(float maximum) - { - Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); - Vector256 max = Vector256.Create(maximum); - - ref Vector256 row0 = ref Unsafe.As>(ref this.V0L); - row0 = NormalizeAndRoundVector256(row0, off, max); - - ref Vector256 row1 = ref Unsafe.As>(ref this.V1L); - row1 = NormalizeAndRoundVector256(row1, off, max); - - ref Vector256 row2 = ref Unsafe.As>(ref this.V2L); - row2 = NormalizeAndRoundVector256(row2, off, max); - - ref Vector256 row3 = ref Unsafe.As>(ref this.V3L); - row3 = NormalizeAndRoundVector256(row3, off, max); - - ref Vector256 row4 = ref Unsafe.As>(ref this.V4L); - row4 = NormalizeAndRoundVector256(row4, off, max); - - ref Vector256 row5 = ref Unsafe.As>(ref this.V5L); - row5 = NormalizeAndRoundVector256(row5, off, max); - - ref Vector256 row6 = ref Unsafe.As>(ref this.V6L); - row6 = NormalizeAndRoundVector256(row6, off, max); - - ref Vector256 row7 = ref Unsafe.As>(ref this.V7L); - row7 = NormalizeAndRoundVector256(row7, off, max); - } - - /// - /// version of and . - /// - /// The maximum value to normalize to. 
- [MethodImpl(InliningOptions.ShortMethod)] - public void NormalizeColorsAndRoundInPlaceVector128(float maximum) - { - Vector128 off = Vector128.Create(MathF.Ceiling(maximum * 0.5F)); - Vector128 max = Vector128.Create(maximum); - - this.V0L = NormalizeAndRoundVector128(this.V0L.AsVector128(), off, max).AsVector4(); - this.V0R = NormalizeAndRoundVector128(this.V0R.AsVector128(), off, max).AsVector4(); - this.V1L = NormalizeAndRoundVector128(this.V1L.AsVector128(), off, max).AsVector4(); - this.V1R = NormalizeAndRoundVector128(this.V1R.AsVector128(), off, max).AsVector4(); - this.V2L = NormalizeAndRoundVector128(this.V2L.AsVector128(), off, max).AsVector4(); - this.V2R = NormalizeAndRoundVector128(this.V2R.AsVector128(), off, max).AsVector4(); - this.V3L = NormalizeAndRoundVector128(this.V3L.AsVector128(), off, max).AsVector4(); - this.V3R = NormalizeAndRoundVector128(this.V3R.AsVector128(), off, max).AsVector4(); - this.V4L = NormalizeAndRoundVector128(this.V4L.AsVector128(), off, max).AsVector4(); - this.V4R = NormalizeAndRoundVector128(this.V4R.AsVector128(), off, max).AsVector4(); - this.V5L = NormalizeAndRoundVector128(this.V5L.AsVector128(), off, max).AsVector4(); - this.V5R = NormalizeAndRoundVector128(this.V5R.AsVector128(), off, max).AsVector4(); - this.V6L = NormalizeAndRoundVector128(this.V6L.AsVector128(), off, max).AsVector4(); - this.V6R = NormalizeAndRoundVector128(this.V6R.AsVector128(), off, max).AsVector4(); - this.V7L = NormalizeAndRoundVector128(this.V7L.AsVector128(), off, max).AsVector4(); - this.V7R = NormalizeAndRoundVector128(this.V7R.AsVector128(), off, max).AsVector4(); - } - - /// - /// Fill the block from 'source' doing short -> float conversion. - /// - /// The source block - public void LoadFromInt16Scalar(ref Block8x8 source) - { - ref short selfRef = ref Unsafe.As(ref source); - - this.V0L.X = Unsafe.Add(ref selfRef, 0); - this.V0L.Y = Unsafe.Add(ref selfRef, 1); - this.V0L.Z = Unsafe.Add(ref selfRef, 2); - this.V0L.W = Unsafe.Add(ref selfRef, 3); - this.V0R.X = Unsafe.Add(ref selfRef, 4); - this.V0R.Y = Unsafe.Add(ref selfRef, 5); - this.V0R.Z = Unsafe.Add(ref selfRef, 6); - this.V0R.W = Unsafe.Add(ref selfRef, 7); - - this.V1L.X = Unsafe.Add(ref selfRef, 8); - this.V1L.Y = Unsafe.Add(ref selfRef, 9); - this.V1L.Z = Unsafe.Add(ref selfRef, 10); - this.V1L.W = Unsafe.Add(ref selfRef, 11); - this.V1R.X = Unsafe.Add(ref selfRef, 12); - this.V1R.Y = Unsafe.Add(ref selfRef, 13); - this.V1R.Z = Unsafe.Add(ref selfRef, 14); - this.V1R.W = Unsafe.Add(ref selfRef, 15); - - this.V2L.X = Unsafe.Add(ref selfRef, 16); - this.V2L.Y = Unsafe.Add(ref selfRef, 17); - this.V2L.Z = Unsafe.Add(ref selfRef, 18); - this.V2L.W = Unsafe.Add(ref selfRef, 19); - this.V2R.X = Unsafe.Add(ref selfRef, 20); - this.V2R.Y = Unsafe.Add(ref selfRef, 21); - this.V2R.Z = Unsafe.Add(ref selfRef, 22); - this.V2R.W = Unsafe.Add(ref selfRef, 23); - - this.V3L.X = Unsafe.Add(ref selfRef, 24); - this.V3L.Y = Unsafe.Add(ref selfRef, 25); - this.V3L.Z = Unsafe.Add(ref selfRef, 26); - this.V3L.W = Unsafe.Add(ref selfRef, 27); - this.V3R.X = Unsafe.Add(ref selfRef, 28); - this.V3R.Y = Unsafe.Add(ref selfRef, 29); - this.V3R.Z = Unsafe.Add(ref selfRef, 30); - this.V3R.W = Unsafe.Add(ref selfRef, 31); - - this.V4L.X = Unsafe.Add(ref selfRef, 32); - this.V4L.Y = Unsafe.Add(ref selfRef, 33); - this.V4L.Z = Unsafe.Add(ref selfRef, 34); - this.V4L.W = Unsafe.Add(ref selfRef, 35); - this.V4R.X = Unsafe.Add(ref selfRef, 36); - this.V4R.Y = Unsafe.Add(ref selfRef, 37); - this.V4R.Z = Unsafe.Add(ref selfRef, 
38); - this.V4R.W = Unsafe.Add(ref selfRef, 39); - - this.V5L.X = Unsafe.Add(ref selfRef, 40); - this.V5L.Y = Unsafe.Add(ref selfRef, 41); - this.V5L.Z = Unsafe.Add(ref selfRef, 42); - this.V5L.W = Unsafe.Add(ref selfRef, 43); - this.V5R.X = Unsafe.Add(ref selfRef, 44); - this.V5R.Y = Unsafe.Add(ref selfRef, 45); - this.V5R.Z = Unsafe.Add(ref selfRef, 46); - this.V5R.W = Unsafe.Add(ref selfRef, 47); - - this.V6L.X = Unsafe.Add(ref selfRef, 48); - this.V6L.Y = Unsafe.Add(ref selfRef, 49); - this.V6L.Z = Unsafe.Add(ref selfRef, 50); - this.V6L.W = Unsafe.Add(ref selfRef, 51); - this.V6R.X = Unsafe.Add(ref selfRef, 52); - this.V6R.Y = Unsafe.Add(ref selfRef, 53); - this.V6R.Z = Unsafe.Add(ref selfRef, 54); - this.V6R.W = Unsafe.Add(ref selfRef, 55); - - this.V7L.X = Unsafe.Add(ref selfRef, 56); - this.V7L.Y = Unsafe.Add(ref selfRef, 57); - this.V7L.Z = Unsafe.Add(ref selfRef, 58); - this.V7L.W = Unsafe.Add(ref selfRef, 59); - this.V7R.X = Unsafe.Add(ref selfRef, 60); - this.V7R.Y = Unsafe.Add(ref selfRef, 61); - this.V7R.Z = Unsafe.Add(ref selfRef, 62); - this.V7R.W = Unsafe.Add(ref selfRef, 63); - } -} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs new file mode 100644 index 000000000..37332db62 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs @@ -0,0 +1,66 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +/// +/// version of . +/// +internal partial struct Block8x8F +{ + /// + /// version of and . + /// + /// The maximum value to normalize to. 
+ [MethodImpl(InliningOptions.ShortMethod)] + public void NormalizeColorsAndRoundInPlaceVector128(float maximum) + { + Vector128 off = Vector128.Create(MathF.Ceiling(maximum * 0.5F)); + Vector128 max = Vector128.Create(maximum); + + this.V0L = NormalizeAndRoundVector128(this.V0L.AsVector128(), off, max).AsVector4(); + this.V0R = NormalizeAndRoundVector128(this.V0R.AsVector128(), off, max).AsVector4(); + this.V1L = NormalizeAndRoundVector128(this.V1L.AsVector128(), off, max).AsVector4(); + this.V1R = NormalizeAndRoundVector128(this.V1R.AsVector128(), off, max).AsVector4(); + this.V2L = NormalizeAndRoundVector128(this.V2L.AsVector128(), off, max).AsVector4(); + this.V2R = NormalizeAndRoundVector128(this.V2R.AsVector128(), off, max).AsVector4(); + this.V3L = NormalizeAndRoundVector128(this.V3L.AsVector128(), off, max).AsVector4(); + this.V3R = NormalizeAndRoundVector128(this.V3R.AsVector128(), off, max).AsVector4(); + this.V4L = NormalizeAndRoundVector128(this.V4L.AsVector128(), off, max).AsVector4(); + this.V4R = NormalizeAndRoundVector128(this.V4R.AsVector128(), off, max).AsVector4(); + this.V5L = NormalizeAndRoundVector128(this.V5L.AsVector128(), off, max).AsVector4(); + this.V5R = NormalizeAndRoundVector128(this.V5R.AsVector128(), off, max).AsVector4(); + this.V6L = NormalizeAndRoundVector128(this.V6L.AsVector128(), off, max).AsVector4(); + this.V6R = NormalizeAndRoundVector128(this.V6R.AsVector128(), off, max).AsVector4(); + this.V7L = NormalizeAndRoundVector128(this.V7L.AsVector128(), off, max).AsVector4(); + this.V7R = NormalizeAndRoundVector128(this.V7R.AsVector128(), off, max).AsVector4(); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 NormalizeAndRoundVector128(Vector128 value, Vector128 off, Vector128 max) + => Vector128_.RoundToNearestInteger(Vector128_.Clamp(value + off, Vector128.Zero, max)); + + private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + { + DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!"); + + ref Vector128 aBase = ref Unsafe.As>(ref a); + ref Vector128 bBase = ref Unsafe.As>(ref b); + + ref Vector128 destBase = ref Unsafe.As>(ref dest); + + // TODO: We can use the v128 utilities for this. + for (nuint i = 0; i < 16; i += 2) + { + Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + + Unsafe.Add(ref destBase, i / 2) = Sse2.PackSignedSaturate(left, right); + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs new file mode 100644 index 000000000..a7d5c89b3 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs @@ -0,0 +1,191 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +/// +/// version of . 
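To make the level shift concrete: with maximum = 255 the offset is MathF.Ceiling(255 * 0.5F) = 128, so a post-IDCT value of -70.3F becomes round(clamp(-70.3 + 128, 0, 255)) = 58, while 200.6F saturates to 255. A scalar restatement of one lane (the NormalizeSketch class and method names are illustrative only):

using System;

internal static class NormalizeSketch
{
    // One lane of NormalizeAndRoundVector128: level shift, clamp, round to nearest even.
    public static float NormalizeAndRound(float value, float maximum)
    {
        float off = MathF.Ceiling(maximum * 0.5F);            // 128 for an 8-bit range
        float shifted = Math.Clamp(value + off, 0F, maximum);
        return MathF.Round(shifted, MidpointRounding.ToEven); // matches RoundToNearestInteger
    }

    // NormalizeAndRound(-70.3F, 255F) == 58
    // NormalizeAndRound(200.6F, 255F) == 255
}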
+/// +internal partial struct Block8x8F +{ + /// + /// A number of rows of 8 scalar coefficients each in + /// + public const int RowCount = 8; + +#pragma warning disable SA1310 // Field names should not contain underscore + [FieldOffset(0)] + public Vector256 V256_0; + [FieldOffset(32)] + public Vector256 V256_1; + [FieldOffset(64)] + public Vector256 V256_2; + [FieldOffset(96)] + public Vector256 V256_3; + [FieldOffset(128)] + public Vector256 V256_4; + [FieldOffset(160)] + public Vector256 V256_5; + [FieldOffset(192)] + public Vector256 V256_6; + [FieldOffset(224)] + public Vector256 V256_7; +#pragma warning restore SA1310 // Field names should not contain underscore + + /// + /// version of and . + /// + /// The maximum value to normalize to. + [MethodImpl(InliningOptions.ShortMethod)] + public void NormalizeColorsAndRoundInPlaceVector256(float maximum) + { + Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); + Vector256 max = Vector256.Create(maximum); + + this.V256_0 = NormalizeAndRoundVector256(this.V256_0, off, max); + this.V256_1 = NormalizeAndRoundVector256(this.V256_1, off, max); + this.V256_2 = NormalizeAndRoundVector256(this.V256_2, off, max); + this.V256_3 = NormalizeAndRoundVector256(this.V256_3, off, max); + this.V256_4 = NormalizeAndRoundVector256(this.V256_4, off, max); + this.V256_5 = NormalizeAndRoundVector256(this.V256_5, off, max); + this.V256_6 = NormalizeAndRoundVector256(this.V256_6, off, max); + this.V256_7 = NormalizeAndRoundVector256(this.V256_7, off, max); + } + + /// + /// Loads values from using extended AVX2 intrinsics. + /// + /// The source + public void LoadFromInt16ExtendedAvx2(ref Block8x8 source) + { + DebugGuard.IsTrue( + Avx2.IsSupported, + "LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!"); + + ref short sRef = ref Unsafe.As(ref source); + ref Vector256 dRef = ref Unsafe.As>(ref this); + + // Vector256.Count == 16 on AVX2 + // We can process 2 block rows in a single step + Vector256 top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef)); + Vector256 bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256.Count)); + dRef = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom); + + top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 2))); + bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 3))); + Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom); + + top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 4))); + bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 5))); + Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom); + + top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 6))); + bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 7))); + Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector256 NormalizeAndRoundVector256(Vector256 value, Vector256 off, Vector256 max) + => Vector256_.RoundToNearestInteger(Vector256_.Clamp(value + off, Vector256.Zero, max)); + + private static unsafe void 
MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + { + DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); + + ref Vector256 aBase = ref a.V256_0; + ref Vector256 bBase = ref b.V256_0; + + ref Vector256 destRef = ref dest.V01; + Vector256 multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); + + for (nuint i = 0; i < 8; i += 2) + { + Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + + Vector256 row = Avx2.PackSignedSaturate(row0, row1); + row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16(); + + Unsafe.Add(ref destRef, i / 2) = row; + } + } + + private void TransposeInplace_Avx() + { + // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 + Vector256 r0 = Avx.InsertVector128( + this.V256_0, + Unsafe.As>(ref this.V4L), + 1); + + Vector256 r1 = Avx.InsertVector128( + this.V256_1, + Unsafe.As>(ref this.V5L), + 1); + + Vector256 r2 = Avx.InsertVector128( + this.V256_2, + Unsafe.As>(ref this.V6L), + 1); + + Vector256 r3 = Avx.InsertVector128( + this.V256_3, + Unsafe.As>(ref this.V7L), + 1); + + Vector256 r4 = Avx.InsertVector128( + Unsafe.As>(ref this.V0R).ToVector256(), + Unsafe.As>(ref this.V4R), + 1); + + Vector256 r5 = Avx.InsertVector128( + Unsafe.As>(ref this.V1R).ToVector256(), + Unsafe.As>(ref this.V5R), + 1); + + Vector256 r6 = Avx.InsertVector128( + Unsafe.As>(ref this.V2R).ToVector256(), + Unsafe.As>(ref this.V6R), + 1); + + Vector256 r7 = Avx.InsertVector128( + Unsafe.As>(ref this.V3R).ToVector256(), + Unsafe.As>(ref this.V7R), + 1); + + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackLow(r2, r3); + Vector256 v = Avx.Shuffle(t0, t2, 0x4E); + this.V256_0 = Avx.Blend(t0, v, 0xCC); + this.V256_1 = Avx.Blend(t2, v, 0x33); + + Vector256 t4 = Avx.UnpackLow(r4, r5); + Vector256 t6 = Avx.UnpackLow(r6, r7); + v = Avx.Shuffle(t4, t6, 0x4E); + this.V256_4 = Avx.Blend(t4, v, 0xCC); + this.V256_5 = Avx.Blend(t6, v, 0x33); + + Vector256 t1 = Avx.UnpackHigh(r0, r1); + Vector256 t3 = Avx.UnpackHigh(r2, r3); + v = Avx.Shuffle(t1, t3, 0x4E); + this.V256_2 = Avx.Blend(t1, v, 0xCC); + this.V256_3 = Avx.Blend(t3, v, 0x33); + + Vector256 t5 = Avx.UnpackHigh(r4, r5); + Vector256 t7 = Avx.UnpackHigh(r6, r7); + v = Avx.Shuffle(t5, t7, 0x4E); + this.V256_6 = Avx.Blend(t5, v, 0xCC); + this.V256_7 = Avx.Blend(t7, v, 0x33); + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 2eecafc13..ec563897d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -8,8 +8,6 @@ using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using System.Text; using SixLabors.ImageSharp.Common.Helpers; -using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities; -using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Jpeg.Components; @@ -25,7 +23,6 @@ internal partial struct Block8x8F : IEquatable /// public const int Size = 64; -#pragma warning disable SA1600 // ElementsMustBeDocumented [FieldOffset(0)] public Vector4 V0L; [FieldOffset(16)] @@ -65,7 +62,6 @@ internal partial struct Block8x8F : 
IEquatable public Vector4 V7L; [FieldOffset(240)] public Vector4 V7R; -#pragma warning restore SA1600 // ElementsMustBeDocumented /// /// Get/Set scalar elements at a given index @@ -159,18 +155,17 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void MultiplyInPlace(float value) { - // TODO: Vector512 if (Vector256.IsHardwareAccelerated) { Vector256 valueVec = Vector256.Create(value); - this.V0 *= valueVec; - this.V1 *= valueVec; - this.V2 *= valueVec; - this.V3 *= valueVec; - this.V4 *= valueVec; - this.V5 *= valueVec; - this.V6 *= valueVec; - this.V7 *= valueVec; + this.V256_0 *= valueVec; + this.V256_1 *= valueVec; + this.V256_2 *= valueVec; + this.V256_3 *= valueVec; + this.V256_4 *= valueVec; + this.V256_5 *= valueVec; + this.V256_6 *= valueVec; + this.V256_7 *= valueVec; } else { @@ -201,17 +196,16 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public unsafe void MultiplyInPlace(ref Block8x8F other) { - // TODO: Vector512 if (Vector256.IsHardwareAccelerated) { - this.V0 *= other.V0; - this.V1 *= other.V1; - this.V2 *= other.V2; - this.V3 *= other.V3; - this.V4 *= other.V4; - this.V5 *= other.V5; - this.V6 *= other.V6; - this.V7 *= other.V7; + this.V256_0 *= other.V256_0; + this.V256_1 *= other.V256_1; + this.V256_2 *= other.V256_2; + this.V256_3 *= other.V256_3; + this.V256_4 *= other.V256_4; + this.V256_5 *= other.V256_5; + this.V256_6 *= other.V256_6; + this.V256_7 *= other.V256_7; } else { @@ -241,18 +235,17 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void AddInPlace(float value) { - // TODO: Vector512 if (Vector256.IsHardwareAccelerated) { Vector256 valueVec = Vector256.Create(value); - this.V0 += valueVec; - this.V1 += valueVec; - this.V2 += valueVec; - this.V3 += valueVec; - this.V4 += valueVec; - this.V5 += valueVec; - this.V6 += valueVec; - this.V7 += valueVec; + this.V256_0 += valueVec; + this.V256_1 += valueVec; + this.V256_2 += valueVec; + this.V256_3 += valueVec; + this.V256_4 += valueVec; + this.V256_5 += valueVec; + this.V256_6 += valueVec; + this.V256_7 += valueVec; } else { @@ -352,6 +345,34 @@ internal partial struct Block8x8F : IEquatable } } + /// + /// Level shift by +maximum/2, clip to [0, maximum] + /// + /// The maximum value to normalize to. + public void NormalizeColorsInPlace(float maximum) + { + Vector4 min = Vector4.Zero; + Vector4 max = new(maximum); + Vector4 off = new(MathF.Ceiling(maximum * 0.5F)); + + this.V0L = Vector4.Clamp(this.V0L + off, min, max); + this.V0R = Vector4.Clamp(this.V0R + off, min, max); + this.V1L = Vector4.Clamp(this.V1L + off, min, max); + this.V1R = Vector4.Clamp(this.V1R + off, min, max); + this.V2L = Vector4.Clamp(this.V2L + off, min, max); + this.V2R = Vector4.Clamp(this.V2R + off, min, max); + this.V3L = Vector4.Clamp(this.V3L + off, min, max); + this.V3R = Vector4.Clamp(this.V3R + off, min, max); + this.V4L = Vector4.Clamp(this.V4L + off, min, max); + this.V4R = Vector4.Clamp(this.V4R + off, min, max); + this.V5L = Vector4.Clamp(this.V5L + off, min, max); + this.V5R = Vector4.Clamp(this.V5R + off, min, max); + this.V6L = Vector4.Clamp(this.V6L + off, min, max); + this.V6R = Vector4.Clamp(this.V6R + off, min, max); + this.V7L = Vector4.Clamp(this.V7L + off, min, max); + this.V7R = Vector4.Clamp(this.V7R + off, min, max); + } + /// /// Rounds all values in the block. 
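The V256_0..V256_7 fields used above overlap the existing V0L/V0R..V7L/V7R pairs through the struct's explicit layout, so the 256-bit paths read and write the same storage without Unsafe.As reinterpretation. A stripped-down sketch of that overlay (RowSketch is illustrative, not the real Block8x8F):

using System.Numerics;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

// The same 32 bytes viewed two ways: writes through Wide are visible via Left/Right.
[StructLayout(LayoutKind.Explicit)]
internal struct RowSketch
{
    [FieldOffset(0)]
    public Vector4 Left;           // elements 0..3

    [FieldOffset(16)]
    public Vector4 Right;          // elements 4..7

    [FieldOffset(0)]
    public Vector256<float> Wide;  // SIMD view of elements 0..7
}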
/// @@ -376,39 +397,84 @@ internal partial struct Block8x8F : IEquatable } /// - /// Loads values from using extended AVX2 intrinsics. + /// Fill the block from doing short -> float conversion. /// - /// The source - public void LoadFromInt16ExtendedAvx2(ref Block8x8 source) + /// The source block + public void LoadFromInt16Scalar(ref Block8x8 source) { - DebugGuard.IsTrue( - Avx2.IsSupported, - "LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!"); - - ref short sRef = ref Unsafe.As(ref source); - ref Vector256 dRef = ref Unsafe.As>(ref this); - - // Vector256.Count == 16 on AVX2 - // We can process 2 block rows in a single step - Vector256 top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef)); - Vector256 bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256.Count)); - dRef = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 2))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 3))); - Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 4))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 5))); - Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 6))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 7))); - Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom); + ref short selfRef = ref Unsafe.As(ref source); + + this.V0L.X = Unsafe.Add(ref selfRef, 0); + this.V0L.Y = Unsafe.Add(ref selfRef, 1); + this.V0L.Z = Unsafe.Add(ref selfRef, 2); + this.V0L.W = Unsafe.Add(ref selfRef, 3); + this.V0R.X = Unsafe.Add(ref selfRef, 4); + this.V0R.Y = Unsafe.Add(ref selfRef, 5); + this.V0R.Z = Unsafe.Add(ref selfRef, 6); + this.V0R.W = Unsafe.Add(ref selfRef, 7); + + this.V1L.X = Unsafe.Add(ref selfRef, 8); + this.V1L.Y = Unsafe.Add(ref selfRef, 9); + this.V1L.Z = Unsafe.Add(ref selfRef, 10); + this.V1L.W = Unsafe.Add(ref selfRef, 11); + this.V1R.X = Unsafe.Add(ref selfRef, 12); + this.V1R.Y = Unsafe.Add(ref selfRef, 13); + this.V1R.Z = Unsafe.Add(ref selfRef, 14); + this.V1R.W = Unsafe.Add(ref selfRef, 15); + + this.V2L.X = Unsafe.Add(ref selfRef, 16); + this.V2L.Y = Unsafe.Add(ref selfRef, 17); + this.V2L.Z = Unsafe.Add(ref selfRef, 18); + this.V2L.W = Unsafe.Add(ref selfRef, 19); + this.V2R.X = Unsafe.Add(ref selfRef, 20); + this.V2R.Y = Unsafe.Add(ref selfRef, 21); + this.V2R.Z = Unsafe.Add(ref selfRef, 22); + this.V2R.W = Unsafe.Add(ref selfRef, 23); + + this.V3L.X = Unsafe.Add(ref selfRef, 24); + this.V3L.Y = Unsafe.Add(ref selfRef, 25); + this.V3L.Z = Unsafe.Add(ref selfRef, 26); + this.V3L.W = Unsafe.Add(ref selfRef, 27); + this.V3R.X = Unsafe.Add(ref selfRef, 28); + this.V3R.Y = Unsafe.Add(ref selfRef, 29); + this.V3R.Z = Unsafe.Add(ref selfRef, 30); + this.V3R.W = Unsafe.Add(ref selfRef, 31); + + this.V4L.X = Unsafe.Add(ref selfRef, 32); + this.V4L.Y = Unsafe.Add(ref selfRef, 33); + this.V4L.Z = Unsafe.Add(ref selfRef, 34); + 
this.V4L.W = Unsafe.Add(ref selfRef, 35); + this.V4R.X = Unsafe.Add(ref selfRef, 36); + this.V4R.Y = Unsafe.Add(ref selfRef, 37); + this.V4R.Z = Unsafe.Add(ref selfRef, 38); + this.V4R.W = Unsafe.Add(ref selfRef, 39); + + this.V5L.X = Unsafe.Add(ref selfRef, 40); + this.V5L.Y = Unsafe.Add(ref selfRef, 41); + this.V5L.Z = Unsafe.Add(ref selfRef, 42); + this.V5L.W = Unsafe.Add(ref selfRef, 43); + this.V5R.X = Unsafe.Add(ref selfRef, 44); + this.V5R.Y = Unsafe.Add(ref selfRef, 45); + this.V5R.Z = Unsafe.Add(ref selfRef, 46); + this.V5R.W = Unsafe.Add(ref selfRef, 47); + + this.V6L.X = Unsafe.Add(ref selfRef, 48); + this.V6L.Y = Unsafe.Add(ref selfRef, 49); + this.V6L.Z = Unsafe.Add(ref selfRef, 50); + this.V6L.W = Unsafe.Add(ref selfRef, 51); + this.V6R.X = Unsafe.Add(ref selfRef, 52); + this.V6R.Y = Unsafe.Add(ref selfRef, 53); + this.V6R.Z = Unsafe.Add(ref selfRef, 54); + this.V6R.W = Unsafe.Add(ref selfRef, 55); + + this.V7L.X = Unsafe.Add(ref selfRef, 56); + this.V7L.Y = Unsafe.Add(ref selfRef, 57); + this.V7L.Z = Unsafe.Add(ref selfRef, 58); + this.V7L.W = Unsafe.Add(ref selfRef, 59); + this.V7R.X = Unsafe.Add(ref selfRef, 60); + this.V7R.Y = Unsafe.Add(ref selfRef, 61); + this.V7R.Z = Unsafe.Add(ref selfRef, 62); + this.V7R.W = Unsafe.Add(ref selfRef, 63); } /// @@ -422,11 +488,11 @@ internal partial struct Block8x8F : IEquatable const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); Vector256 targetVector = Vector256.Create(value); - ref Vector256 blockStride = ref this.V0; + ref Vector256 blockStride = ref this.V256_0; for (nuint i = 0; i < RowCount; i++) { - Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector); + Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V256_0, i)), targetVector); if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask) { return false; @@ -577,31 +643,4 @@ internal partial struct Block8x8F : IEquatable // row #6 RuntimeUtility.Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62)); } - - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector NormalizeAndRound(Vector row, Vector off, Vector max) - { - row += off; - row = Vector.Max(row, Vector.Zero); - row = Vector.Min(row, max); - return row.FastRound(); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector256 NormalizeAndRoundVector256(Vector256 row, Vector256 off, Vector256 max) - { - row += off; - row = Vector256.Max(row, Vector256.Zero); - row = Vector256.Min(row, max); - return Vector256_.RoundToNearestInteger(row); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 NormalizeAndRoundVector128(Vector128 row, Vector128 off, Vector128 max) - { - row += off; - row = Vector128.Max(row, Vector128.Zero); - row = Vector128.Min(row, max); - return Vector128_.RoundToNearestInteger(row); - } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs index f3a6f7d37..4b350f6f3 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs @@ -1,4 +1,4 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. 
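In the EqualsToScalar hunk above, the equality test relies on Avx2.MoveMask packing the top bit of each of the 32 byte lanes into an int, so "every lane equal" is exactly mask == -1 (the unchecked all-ones constant). A small check (values invented):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

Vector256<int> a = Vector256.Create(7);
Vector256<int> b = Vector256.Create(7);
int mask = Avx2.MoveMask(Avx2.CompareEqual(a, b).AsByte());
// mask == -1 (all 32 bits set) when every lane matches; a differing lane clears its four bits.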
using System.Runtime.CompilerServices; @@ -60,7 +60,7 @@ internal abstract partial class JpegColorConverterBase ref Vector128 b = ref Unsafe.Add(ref srcBlue, i); // luminosity = (0.299 * r) + (0.587 * g) + (0.114 * b) - Unsafe.Add(ref destLuminance, i) = Vector128Utilities.MultiplyAdd(Vector128Utilities.MultiplyAdd(f0114 * b, f0587, g), f0299, r); + Unsafe.Add(ref destLuminance, i) = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(f0114 * b, f0587, g), f0299, r); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs index 139ffc549..94b897e07 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs index 21d5eaa6f..638f4278b 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs index 8cecd3956..6eabb3ee0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs index f8517e086..233437da9 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. 
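In the grayscale hunk above, the nested MultiplyAdd calls expand to (0.114 * b + 0.587 * g) + 0.299 * r, the BT.601 luma weights quoted in the comment. A scalar check (the Luminance helper is illustrative only):

// Scalar restatement of the Vector128_ MultiplyAdd nesting.
static float Luminance(float r, float g, float b)
    => ((0.114F * b) + (0.587F * g)) + (0.299F * r);

// Luminance(1F, 1F, 1F) is ~1F (the weights sum to one)
// Luminance(1F, 0F, 0F) == 0.299F (pure red)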
// Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs index 7598a64b2..44c0bcf2b 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs index 5bb2c5e5b..e36683dee 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs index 27f2ce035..b1228ba01 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs @@ -1,10 +1,10 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. 
using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs index 42d89a231..0db081c6f 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs @@ -4,7 +4,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs index b11d834a8..862c77469 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs @@ -26,14 +26,14 @@ internal static partial class FloatingPointDCT // Applies 1D floating point FDCT inplace static void FDCT8x8_1D_Avx(ref Block8x8F block) { - Vector256 tmp0 = Avx.Add(block.V0, block.V7); - Vector256 tmp7 = Avx.Subtract(block.V0, block.V7); - Vector256 tmp1 = Avx.Add(block.V1, block.V6); - Vector256 tmp6 = Avx.Subtract(block.V1, block.V6); - Vector256 tmp2 = Avx.Add(block.V2, block.V5); - Vector256 tmp5 = Avx.Subtract(block.V2, block.V5); - Vector256 tmp3 = Avx.Add(block.V3, block.V4); - Vector256 tmp4 = Avx.Subtract(block.V3, block.V4); + Vector256 tmp0 = Avx.Add(block.V256_0, block.V256_7); + Vector256 tmp7 = Avx.Subtract(block.V256_0, block.V256_7); + Vector256 tmp1 = Avx.Add(block.V256_1, block.V256_6); + Vector256 tmp6 = Avx.Subtract(block.V256_1, block.V256_6); + Vector256 tmp2 = Avx.Add(block.V256_2, block.V256_5); + Vector256 tmp5 = Avx.Subtract(block.V256_2, block.V256_5); + Vector256 tmp3 = Avx.Add(block.V256_3, block.V256_4); + Vector256 tmp4 = Avx.Subtract(block.V256_3, block.V256_4); // Even part Vector256 tmp10 = Avx.Add(tmp0, tmp3); @@ -41,13 +41,13 @@ internal static partial class FloatingPointDCT Vector256 tmp11 = Avx.Add(tmp1, tmp2); Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); - block.V0 = Avx.Add(tmp10, tmp11); - block.V4 = Avx.Subtract(tmp10, tmp11); + block.V256_0 = Avx.Add(tmp10, tmp11); + block.V256_4 = Avx.Subtract(tmp10, tmp11); var mm256_F_0_7071 = Vector256.Create(0.707106781f); Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); - block.V2 = Avx.Add(tmp13, z1); - block.V6 = Avx.Subtract(tmp13, z1); + block.V256_2 = Avx.Add(tmp13, z1); + block.V256_6 = Avx.Subtract(tmp13, z1); // Odd part tmp10 = Avx.Add(tmp4, tmp5); @@ -62,10 +62,10 @@ internal static partial class FloatingPointDCT Vector256 z11 = Avx.Add(tmp7, z3); Vector256 z13 = Avx.Subtract(tmp7, z3); - block.V5 = Avx.Add(z13, z2); - block.V3 = Avx.Subtract(z13, z2); - block.V1 = Avx.Add(z11, z4); - block.V7 = Avx.Subtract(z11, z4); + block.V256_5 = Avx.Add(z13, z2); + block.V256_3 = Avx.Subtract(z13, z2); + block.V256_1 = Avx.Add(z11, z4); + block.V256_7 = Avx.Subtract(z11, z4); } } @@ -88,10 +88,10 @@ internal static partial class FloatingPointDCT 
static void IDCT8x8_1D_Avx(ref Block8x8F block) { // Even part - Vector256 tmp0 = block.V0; - Vector256 tmp1 = block.V2; - Vector256 tmp2 = block.V4; - Vector256 tmp3 = block.V6; + Vector256 tmp0 = block.V256_0; + Vector256 tmp1 = block.V256_2; + Vector256 tmp2 = block.V256_4; + Vector256 tmp3 = block.V256_6; Vector256 z5 = tmp0; Vector256 tmp10 = Avx.Add(z5, tmp2); @@ -107,10 +107,10 @@ internal static partial class FloatingPointDCT tmp2 = Avx.Subtract(tmp11, tmp12); // Odd part - Vector256 tmp4 = block.V1; - Vector256 tmp5 = block.V3; - Vector256 tmp6 = block.V5; - Vector256 tmp7 = block.V7; + Vector256 tmp4 = block.V256_1; + Vector256 tmp5 = block.V256_3; + Vector256 tmp6 = block.V256_5; + Vector256 tmp7 = block.V256_7; Vector256 z13 = Avx.Add(tmp6, tmp5); Vector256 z10 = Avx.Subtract(tmp6, tmp5); @@ -129,14 +129,14 @@ internal static partial class FloatingPointDCT tmp5 = Avx.Subtract(tmp11, tmp6); tmp4 = Avx.Subtract(tmp10, tmp5); - block.V0 = Avx.Add(tmp0, tmp7); - block.V7 = Avx.Subtract(tmp0, tmp7); - block.V1 = Avx.Add(tmp1, tmp6); - block.V6 = Avx.Subtract(tmp1, tmp6); - block.V2 = Avx.Add(tmp2, tmp5); - block.V5 = Avx.Subtract(tmp2, tmp5); - block.V3 = Avx.Add(tmp3, tmp4); - block.V4 = Avx.Subtract(tmp3, tmp4); + block.V256_0 = Avx.Add(tmp0, tmp7); + block.V256_7 = Avx.Subtract(tmp0, tmp7); + block.V256_1 = Avx.Add(tmp1, tmp6); + block.V256_6 = Avx.Subtract(tmp1, tmp6); + block.V256_2 = Avx.Add(tmp2, tmp5); + block.V256_5 = Avx.Subtract(tmp2, tmp5); + block.V256_3 = Avx.Add(tmp3, tmp4); + block.V256_4 = Avx.Subtract(tmp3, tmp4); } } } diff --git a/src/ImageSharp/Formats/Webp/AlphaDecoder.cs b/src/ImageSharp/Formats/Webp/AlphaDecoder.cs index a9e63a3d0..43dab1ffc 100644 --- a/src/ImageSharp/Formats/Webp/AlphaDecoder.cs +++ b/src/ImageSharp/Formats/Webp/AlphaDecoder.cs @@ -326,11 +326,11 @@ internal class AlphaDecoder : IDisposable { Vector128 a0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, i)), 0); Vector128 a1 = a0.AsByte() + last.AsByte(); - Vector128 a2 = Vector128Utilities.ShiftLeftBytesInVector(a1, 1); + Vector128 a2 = Vector128_.ShiftLeftBytesInVector(a1, 1); Vector128 a3 = a1 + a2; - Vector128 a4 = Vector128Utilities.ShiftLeftBytesInVector(a3, 2); + Vector128 a4 = Vector128_.ShiftLeftBytesInVector(a3, 2); Vector128 a5 = a3 + a4; - Vector128 a6 = Vector128Utilities.ShiftLeftBytesInVector(a5, 4); + Vector128 a6 = Vector128_.ShiftLeftBytesInVector(a5, 4); Vector128 a7 = a5 + a6; ref byte outputRef = ref Unsafe.Add(ref dstRef, i); diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs index 0dc6d26bc..dbd255722 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs @@ -8,6 +8,7 @@ using SixLabors.ImageSharp.Tests; namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg; +[Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class DecodeJpeg { private JpegDecoder decoder; @@ -21,7 +22,7 @@ public class DecodeJpeg this.preloadedImageStream = new MemoryStream(bytes); } - private void GenericBechmark() + private void GenericBenchmark() { this.preloadedImageStream.Position = 0; using Image img = this.decoder.Decode(DecoderOptions.Default, this.preloadedImageStream); @@ -51,16 +52,16 @@ public class DecodeJpeg } [Benchmark(Description = "Baseline 4:4:4 Interleaved")] - public void JpegBaselineInterleaved444() => this.GenericBechmark(); + public void JpegBaselineInterleaved444() => this.GenericBenchmark(); [Benchmark(Description = 
"Baseline 4:2:0 Interleaved")] - public void JpegBaselineInterleaved420() => this.GenericBechmark(); + public void JpegBaselineInterleaved420() => this.GenericBenchmark(); [Benchmark(Description = "Baseline 4:0:0 (grayscale)")] - public void JpegBaseline400() => this.GenericBechmark(); + public void JpegBaseline400() => this.GenericBenchmark(); [Benchmark(Description = "Progressive 4:2:0 Non-Interleaved")] - public void JpegProgressiveNonInterleaved420() => this.GenericBechmark(); + public void JpegProgressiveNonInterleaved420() => this.GenericBenchmark(); } /* diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index e21d0c76d..9fd48301e 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -34,6 +34,7 @@ public partial class Config // like `LZCNT`, `BMI1`, or `BMI2` // `EnableSSE3_4` is a legacy switch that exists for compat and is basically the same as `EnableSSE3` private const string EnableAES = "DOTNET_EnableAES"; + private const string EnableAVX512F = "DOTNET_EnableAVX512F"; private const string EnableAVX = "DOTNET_EnableAVX"; private const string EnableAVX2 = "DOTNET_EnableAVX2"; private const string EnableBMI1 = "DOTNET_EnableBMI1"; @@ -76,4 +77,36 @@ public partial class Config } } } + + public class HwIntrinsics_SSE_AVX_AVX512F : Config + { + public HwIntrinsics_SSE_AVX_AVX512F() + { + this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80) + .WithEnvironmentVariables( + new EnvironmentVariable(EnableHWIntrinsic, Off), + new EnvironmentVariable(FeatureSIMD, Off)) + .WithId("1. No HwIntrinsics").AsBaseline()); + + if (Sse.IsSupported) + { + this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80) + .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) + .WithId("2. SSE")); + } + + if (Avx.IsSupported) + { + this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80) + .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX512F, Off)) + .WithId("3. AVX")); + } + + if (Avx512F.IsSupported) + { + this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80) + .WithId("3. 
AVX512F")); + } + } + } } From 30bdc29e4060bea18832c9e6e905398ed1d8c02a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 7 May 2025 15:14:58 +1000 Subject: [PATCH 04/12] Migrate from Sse to general Vector128 for ZigZag --- .../Jpeg/Components/Block8x8F.Vector128.cs | 13 +- .../Jpeg/Components/Block8x8F.Vector256.cs | 2 +- .../Formats/Jpeg/Components/Block8x8F.cs | 12 +- .../Jpeg/Components/ZigZag.Intrinsic.cs | 135 ++++++++++-------- 4 files changed, 90 insertions(+), 72 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs index 37332db62..8e0d526e5 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs @@ -3,7 +3,6 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Jpeg.Components; @@ -45,22 +44,20 @@ internal partial struct Block8x8F private static Vector128 NormalizeAndRoundVector128(Vector128 value, Vector128 off, Vector128 max) => Vector128_.RoundToNearestInteger(Vector128_.Clamp(value + off, Vector128.Zero, max)); - private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + private static void MultiplyIntoInt16Vector128(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { - DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!"); + DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!"); ref Vector128 aBase = ref Unsafe.As>(ref a); ref Vector128 bBase = ref Unsafe.As>(ref b); - ref Vector128 destBase = ref Unsafe.As>(ref dest); - // TODO: We can use the v128 utilities for this. 
for (nuint i = 0; i < 16; i += 2) { - Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + Vector128 left = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0)); + Vector128 right = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1)); - Unsafe.Add(ref destBase, i / 2) = Sse2.PackSignedSaturate(left, right); + Unsafe.Add(ref destBase, i / 2) = Vector128_.PackSignedSaturate(left, right); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs index a7d5c89b3..3aab547e0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs @@ -121,7 +121,7 @@ internal partial struct Block8x8F } } - private void TransposeInplace_Avx() + private void TransposeInPlace_Avx() { // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 Vector256 r0 = Avx.InsertVector128( diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index ec563897d..284c5bfe5 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -282,10 +282,10 @@ internal partial struct Block8x8F : IEquatable MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest); ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest); } - else if (Ssse3.IsSupported) + else if (Vector128.IsHardwareAccelerated) { - MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest); - ZigZag.ApplyTransposingZigZagOrderingSsse3(ref dest); + MultiplyIntoInt16Vector128(ref block, ref qt, ref dest); + ZigZag.ApplyTransposingZigZagOrderingVector128(ref dest); } else { @@ -387,7 +387,7 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void LoadFrom(ref Block8x8 source) { - if (SimdUtils.HasVector8) + if (Avx2.IsSupported) { this.LoadFromInt16ExtendedAvx2(ref source); return; @@ -483,6 +483,7 @@ internal partial struct Block8x8F : IEquatable /// Value to compare to. public bool EqualsToScalar(int value) { + // TODO: Can we provide a Vector128 implementation for this? if (Avx2.IsSupported) { const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); @@ -585,10 +586,11 @@ internal partial struct Block8x8F : IEquatable { if (Avx.IsSupported) { - this.TransposeInplace_Avx(); + this.TransposeInPlace_Avx(); } else { + // TODO: Can we provide a Vector128 implementation for this? this.TransposeInPlace_Scalar(); } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index f6239ad1e..941edb5c0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -1,6 +1,9 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. 
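The Block8x8F.cs hunk above settles on the dispatch shape the rest of the patch follows: the widest x86 intrinsic first, then the portable Vector128 path (which also lights up on Arm AdvSimd and WASM), then scalar. Because IsSupported and IsHardwareAccelerated are JIT-time constants, untaken branches are removed entirely. A compilable sketch of that shape (DispatchSketch is illustrative only):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class DispatchSketch
{
    // Only the first true branch survives JIT compilation; the rest are dead code.
    public static string Describe()
    {
        if (Avx2.IsSupported)
        {
            return "256-bit x86 path";
        }

        if (Vector128.IsHardwareAccelerated)
        {
            return "portable 128-bit path (SSE, AdvSimd, WASM)";
        }

        return "scalar fallback";
    }
}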
+using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -17,11 +20,11 @@ internal static partial class ZigZag #pragma warning restore SA1309 /// - /// Gets shuffle vectors for + /// Gets shuffle vectors for /// zig zag implementation. /// - private static ReadOnlySpan SseShuffleMasks => new byte[] - { + private static ReadOnlySpan SseShuffleMasks => + [ #pragma warning disable SA1515 /* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */ // A @@ -83,14 +86,14 @@ internal static partial class ZigZag // H _, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, 14, 15, #pragma warning restore SA1515 - }; + ]; /// /// Gets shuffle vectors for /// zig zag implementation. /// - private static ReadOnlySpan AvxShuffleMasks => new byte[] - { + private static ReadOnlySpan AvxShuffleMasks => + [ #pragma warning disable SA1515 /* 01 */ // [cr] crln_01_AB_CD @@ -138,15 +141,15 @@ internal static partial class ZigZag // (in) GH _, _, _, _, _, _, _, _, 0, 1, 10, 11, 12, 13, 2, 3, _, _, _, _, _, _, 0, 1, 6, 7, 8, 9, 2, 3, 10, 11, #pragma warning restore SA1515 - }; + ]; /// - /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics. + /// Applies zig zag ordering for given 8x8 matrix using cpu intrinsics. /// /// Input matrix. - public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block) + public static unsafe void ApplyTransposingZigZagOrderingVector128(ref Block8x8 block) { - DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!"); + DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!"); fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks)) { @@ -160,68 +163,68 @@ internal static partial class ZigZag Vector128 rowH = block.V7.AsByte(); // row0 - A0 B0 A1 A2 B1 C0 D0 C1 - Vector128 row0_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0))).AsInt16(); - Vector128 row0_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))).AsInt16(); - Vector128 row0_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))).AsInt16(); - Vector128 row0 = Sse2.Or(Sse2.Or(row0_A, row0_B), row0_C); - row0 = Sse2.Insert(row0.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 0), 6).AsInt16(); + Vector128 row0_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 0))).AsInt16(); + Vector128 row0_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 1))).AsInt16(); + Vector128 row0_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 2))).AsInt16(); + Vector128 row0 = row0_A | row0_B | row0_C; + row0 = row0.AsUInt16().WithElement(6, rowD.AsUInt16().GetElement(0)).AsInt16(); // row1 - B2 A3 A4 B3 C2 D1 E0 F0 - Vector128 row1_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3))).AsInt16(); - Vector128 row1_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))).AsInt16(); - Vector128 row1 = Sse2.Or(row1_A, row1_B); - row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 2), 4).AsInt16(); - row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 1), 5).AsInt16(); - row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 6).AsInt16(); - row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 0), 7).AsInt16(); + Vector128 row1_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 3))).AsInt16(); + 
Vector128 row1_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 4))).AsInt16(); + Vector128 row1 = row1_A | row1_B; + row1 = row1.AsUInt16().WithElement(4, rowC.AsUInt16().GetElement(2)).AsInt16(); + row1 = row1.AsUInt16().WithElement(5, rowD.AsUInt16().GetElement(1)).AsInt16(); + row1 = row1.AsUInt16().WithElement(6, rowE.AsUInt16().GetElement(0)).AsInt16(); + row1 = row1.AsUInt16().WithElement(7, rowF.AsUInt16().GetElement(0)).AsInt16(); // row2 - E1 D2 C3 B4 A5 A6 B5 C4 - Vector128 row2_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5))).AsInt16(); - Vector128 row2_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))).AsInt16(); - Vector128 row2_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))).AsInt16(); - Vector128 row2 = Sse2.Or(Sse2.Or(row2_A, row2_B), row2_C); - row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 1).AsInt16(); - row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 1), 0).AsInt16(); + Vector128 row2_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 5))).AsInt16(); + Vector128 row2_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 6))).AsInt16(); + Vector128 row2_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 7))).AsInt16(); + Vector128 row2 = row2_A | row2_B | row2_C; + row2 = row2.AsUInt16().WithElement(1, rowD.AsUInt16().GetElement(2)).AsInt16(); + row2 = row2.AsUInt16().WithElement(0, rowE.AsUInt16().GetElement(1)).AsInt16(); // row3 - D3 E2 F1 G0 H0 G1 F2 E3 - Vector128 row3_E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8))).AsInt16(); - Vector128 row3_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))).AsInt16(); - Vector128 row3_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))).AsInt16(); - Vector128 row3 = Sse2.Or(Sse2.Or(row3_E, row3_F), row3_G); - row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 3), 0).AsInt16(); - row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowH.AsUInt16(), 0), 4).AsInt16(); + Vector128 row3_E = ZShuffle(rowE, Vector128.Load(shuffleVectorsPtr + (16 * 8))).AsInt16(); + Vector128 row3_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 9))).AsInt16(); + Vector128 row3_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 10))).AsInt16(); + Vector128 row3 = row3_E | row3_F | row3_G; + row3 = row3.AsUInt16().WithElement(0, rowD.AsUInt16().GetElement(3)).AsInt16(); + row3 = row3.AsUInt16().WithElement(4, rowH.AsUInt16().GetElement(0)).AsInt16(); // row4 - D4 C5 B6 A7 B7 C6 D5 E4 - Vector128 row4_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11))).AsInt16(); - Vector128 row4_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))).AsInt16(); - Vector128 row4_D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))).AsInt16(); - Vector128 row4 = Sse2.Or(Sse2.Or(row4_B, row4_C), row4_D); - row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowA.AsUInt16(), 7), 3).AsInt16(); - row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 4), 7).AsInt16(); + Vector128 row4_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 11))).AsInt16(); + Vector128 row4_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 12))).AsInt16(); + Vector128 row4_D = ZShuffle(rowD, Vector128.Load(shuffleVectorsPtr + (16 * 13))).AsInt16(); + Vector128 row4 = row4_B | row4_C | row4_D; + row4 = row4.AsUInt16().WithElement(3, 
rowA.AsUInt16().GetElement(7)).AsInt16(); + row4 = row4.AsUInt16().WithElement(7, rowE.AsUInt16().GetElement(4)).AsInt16(); // row5 - F3 G2 H1 H2 G3 F4 E5 D6 - Vector128 row5_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14))).AsInt16(); - Vector128 row5_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))).AsInt16(); - Vector128 row5_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))).AsInt16(); - Vector128 row5 = Sse2.Or(Sse2.Or(row5_F, row5_G), row5_H); - row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 6), 7).AsInt16(); - row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 6).AsInt16(); + Vector128 row5_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 14))).AsInt16(); + Vector128 row5_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 15))).AsInt16(); + Vector128 row5_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 16))).AsInt16(); + Vector128 row5 = row5_F | row5_G | row5_H; + row5 = row5.AsUInt16().WithElement(7, rowD.AsUInt16().GetElement(6)).AsInt16(); + row5 = row5.AsUInt16().WithElement(6, rowE.AsUInt16().GetElement(5)).AsInt16(); // row6 - C7 D7 E6 F5 G4 H3 H4 G5 - Vector128 row6_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17))).AsInt16(); - Vector128 row6_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))).AsInt16(); - Vector128 row6 = Sse2.Or(row6_G, row6_H); - row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 7), 0).AsInt16(); - row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 1).AsInt16(); - row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 6), 2).AsInt16(); - row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 5), 3).AsInt16(); + Vector128 row6_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 17))).AsInt16(); + Vector128 row6_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 18))).AsInt16(); + Vector128 row6 = row6_G | row6_H; + row6 = row6.AsUInt16().WithElement(0, rowC.AsUInt16().GetElement(7)).AsInt16(); + row6 = row6.AsUInt16().WithElement(1, rowD.AsUInt16().GetElement(7)).AsInt16(); + row6 = row6.AsUInt16().WithElement(2, rowE.AsUInt16().GetElement(6)).AsInt16(); + row6 = row6.AsUInt16().WithElement(3, rowF.AsUInt16().GetElement(5)).AsInt16(); // row7 - F6 E7 F7 G6 H5 H6 G7 H7 - Vector128 row7_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19))).AsInt16(); - Vector128 row7_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))).AsInt16(); - Vector128 row7_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))).AsInt16(); - Vector128 row7 = Sse2.Or(Sse2.Or(row7_F, row7_G), row7_H); - row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 7), 1).AsInt16(); + Vector128 row7_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 19))).AsInt16(); + Vector128 row7_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 20))).AsInt16(); + Vector128 row7_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 21))).AsInt16(); + Vector128 row7 = row7_F | row7_G | row7_H; + row7 = row7.AsUInt16().WithElement(1, rowE.AsUInt16().GetElement(7)).AsInt16(); block.V0 = row0; block.V1 = row1; @@ -300,4 +303,20 @@ internal static partial class ZigZag block.V67 = row67.AsInt16(); } } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 ZShuffle(Vector128 source, Vector128 mask) + { + // For x64 we use the SSSE3 
shuffle intrinsic to avoid additional instructions. 3 vs 1. + if (Ssse3.IsSupported) + { + return Ssse3.Shuffle(source, mask); + } + + // For ARM and WASM, codegen will be optimal. + return Vector128.Shuffle(source, mask); + } + + [DoesNotReturn] + private static void ThrowUnreachableException() => throw new UnreachableException(); } From 041e59dbce345157af491dda7afd03c7a60016bf Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 7 May 2025 16:20:01 +1000 Subject: [PATCH 05/12] All Vector128 Load --- .../Jpeg/Components/Block8x8F.Vector128.cs | 30 ++++++++++++++++ .../Formats/Jpeg/Components/Block8x8F.cs | 5 +++ .../Formats/Jpg/Block8x8FTests.cs | 35 +++++++++++++++---- 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs index 8e0d526e5..ffd405714 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs @@ -40,6 +40,36 @@ internal partial struct Block8x8F this.V7R = NormalizeAndRoundVector128(this.V7R.AsVector128(), off, max).AsVector4(); } + /// + /// Loads values from using extended AVX2 intrinsics. + /// + /// The source + public void LoadFromInt16ExtendedVector128(ref Block8x8 source) + { + DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!"); + + ref Vector128 srcBase = ref Unsafe.As>(ref source); + ref Vector128 destBase = ref Unsafe.As>(ref this); + + // Only 8 iterations, one per 128b short block + for (nuint i = 0; i < 8; i++) + { + Vector128 src = Unsafe.Add(ref srcBase, i); + + // Step 1: Widen short -> int + Vector128 lower = Vector128.WidenLower(src); // lower 4 shorts -> 4 ints + Vector128 upper = Vector128.WidenUpper(src); // upper 4 shorts -> 4 ints + + // Step 2: Convert int -> float + Vector128 lowerF = Vector128.ConvertToSingle(lower); + Vector128 upperF = Vector128.ConvertToSingle(upper); + + // Step 3: Store to destination (this is 16 lanes -> two Vector128 blocks) + Unsafe.Add(ref destBase, (i * 2) + 0) = lowerF; + Unsafe.Add(ref destBase, (i * 2) + 1) = upperF; + } + } + [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 NormalizeAndRoundVector128(Vector128 value, Vector128 off, Vector128 max) => Vector128_.RoundToNearestInteger(Vector128_.Clamp(value + off, Vector128.Zero, max)); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 284c5bfe5..f7ef44384 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -392,6 +392,11 @@ internal partial struct Block8x8F : IEquatable this.LoadFromInt16ExtendedAvx2(ref source); return; } + else if (Vector128.IsHardwareAccelerated) + { + this.LoadFromInt16ExtendedVector128(ref source); + return; + } this.LoadFromInt16Scalar(ref source); } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 7b73c0c52..1c5d15dc2 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -55,7 +55,7 @@ public partial class Block8x8FTests : JpegFixture Times, () => { - var block = default(Block8x8F); + Block8x8F block = default; for (int i = 0; i < Block8x8F.Size; i++) { @@ -68,7 +68,7 @@ public partial class Block8x8FTests : JpegFixture sum += block[i]; } }); - 
Assert.Equal(sum, 64f * 63f * 0.5f); + Assert.Equal(64f * 63f * 0.5f, sum); } [Fact] @@ -93,7 +93,7 @@ public partial class Block8x8FTests : JpegFixture sum += block[i]; } }); - Assert.Equal(sum, 64f * 63f * 0.5f); + Assert.Equal(64f * 63f * 0.5f, sum); } [Fact] @@ -121,7 +121,7 @@ public partial class Block8x8FTests : JpegFixture } [Fact] - public void TransposeInplace() + public void TransposeInPlace() { static void RunTest() { @@ -276,7 +276,7 @@ public partial class Block8x8FTests : JpegFixture float[] data = Create8x8RandomFloatData(-1000, 1000); Block8x8F source = Block8x8F.Load(data); - var dest = default(Block8x8); + Block8x8 dest = default; source.RoundInto(ref dest); @@ -388,7 +388,7 @@ public partial class Block8x8FTests : JpegFixture short[] data = Create8x8ShortData(); - var source = Block8x8.Load(data); + Block8x8 source = Block8x8.Load(data); Block8x8F dest = default; dest.LoadFromInt16Scalar(ref source); @@ -399,6 +399,27 @@ public partial class Block8x8FTests : JpegFixture } } + [Fact] + public void LoadFromUInt16ExtendedVector128() + { + if (this.SkipOnNonVector128Runner()) + { + return; + } + + short[] data = Create8x8ShortData(); + + Block8x8 source = Block8x8.Load(data); + + Block8x8F dest = default; + dest.LoadFromInt16ExtendedVector128(ref source); + + for (int i = 0; i < Block8x8F.Size; i++) + { + Assert.Equal(data[i], dest[i]); + } + } + [Fact] public void LoadFromUInt16ExtendedAvx2() { @@ -409,7 +430,7 @@ public partial class Block8x8FTests : JpegFixture short[] data = Create8x8ShortData(); - var source = Block8x8.Load(data); + Block8x8 source = Block8x8.Load(data); Block8x8F dest = default; dest.LoadFromInt16ExtendedAvx2(ref source); From 038f047a1c6839383acb47ed1f4b8f242d53812b Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 8 May 2025 15:39:16 +1000 Subject: [PATCH 06/12] Initial fixes based on feedback --- .../Jpeg/Components/Block8x8F.Vector128.cs | 3 ++- .../Jpeg/Components/Block8x8F.Vector256.cs | 2 +- .../Formats/Jpeg/Components/Block8x8F.cs | 24 ++++++++++++++----- .../Formats/Jpg/Block8x8FTests.cs | 2 +- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs index ffd405714..3daa47693 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. 
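// The change below computes the +maximum/2 level-shift offset on the vector unit with
// Vector128.Ceiling rather than scalar MathF.Ceiling. For the usual 8-bit maximum the two agree;
// a small check (illustration only, assuming maximum == 255):
//
//   Vector128<float> max = Vector128.Create(255F);
//   Vector128<float> off = Vector128.Ceiling(max * 0.5F); // every lane is 128F, matching MathF.Ceiling(255F * 0.5F)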
+using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Common.Helpers; @@ -19,8 +20,8 @@ internal partial struct Block8x8F [MethodImpl(InliningOptions.ShortMethod)] public void NormalizeColorsAndRoundInPlaceVector128(float maximum) { - Vector128 off = Vector128.Create(MathF.Ceiling(maximum * 0.5F)); Vector128 max = Vector128.Create(maximum); + Vector128 off = Vector128.Ceiling(max * .5F); this.V0L = NormalizeAndRoundVector128(this.V0L.AsVector128(), off, max).AsVector4(); this.V0R = NormalizeAndRoundVector128(this.V0R.AsVector128(), off, max).AsVector4(); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs index 3aab547e0..4e4133496 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs @@ -46,8 +46,8 @@ internal partial struct Block8x8F [MethodImpl(InliningOptions.ShortMethod)] public void NormalizeColorsAndRoundInPlaceVector256(float maximum) { - Vector256 off = Vector256.Create(MathF.Ceiling(maximum * 0.5F)); Vector256 max = Vector256.Create(maximum); + Vector256 off = Vector256.Ceiling(max * .5F); this.V256_0 = NormalizeAndRoundVector256(this.V256_0, off, max); this.V256_1 = NormalizeAndRoundVector256(this.V256_1, off, max); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index f7ef44384..6f9b4fd16 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -488,18 +488,30 @@ internal partial struct Block8x8F : IEquatable /// Value to compare to. public bool EqualsToScalar(int value) { - // TODO: Can we provide a Vector128 implementation for this? - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { - const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); - Vector256 targetVector = Vector256.Create(value); ref Vector256 blockStride = ref this.V256_0; for (nuint i = 0; i < RowCount; i++) { - Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V256_0, i)), targetVector); - if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask) + if (!Vector256.EqualsAll(Vector256.ConvertToInt32(Unsafe.Add(ref this.V256_0, i)), targetVector)) + { + return false; + } + } + + return true; + } + + if (Vector128.IsHardwareAccelerated) + { + Vector128 targetVector = Vector128.Create(value); + ref Vector4 blockStride = ref this.V0L; + + for (nuint i = 0; i < RowCount * 2; i++) + { + if (!Vector128.EqualsAll(Vector128.ConvertToInt32(Unsafe.Add(ref this.V0L, i).AsVector128()), targetVector)) { return false; } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 1c5d15dc2..d1ade761c 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -462,7 +462,7 @@ public partial class Block8x8FTests : JpegFixture // 3. 
DisableAvx2 - call fallback code of float implementation FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); } [Theory] From 8a23d42bfdd6a1aaa68dc64870458e2514573ce5 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 8 May 2025 20:56:50 +1000 Subject: [PATCH 07/12] Port more V256 code --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 124 +++++++++--------- .../Common/Helpers/Vector128Utilities.cs | 15 ++- .../Common/Helpers/Vector256Utilities.cs | 58 +++++--- .../Common/Helpers/Vector512Utilities.cs | 17 +-- .../Jpeg/Components/Block8x8F.Vector256.cs | 112 ++++++---------- .../Formats/Jpeg/Components/Block8x8F.cs | 12 +- .../Block8x8F_LoadFromInt16.cs | 2 +- .../Formats/Jpg/Block8x8FTests.cs | 2 +- 8 files changed, 164 insertions(+), 178 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 449dc37d0..8533b2151 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -66,9 +66,9 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) || - (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) || - (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat)) + if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) || + (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) || + (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat)) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -112,9 +112,9 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) || + if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) || (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) || - (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte)) + (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -158,7 +158,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign) { int remainder = source.Length % (Vector128.Count * 3); @@ -190,7 +190,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) { int remainder = source.Length % (Vector128.Count * 3); @@ -223,7 +223,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) { int remainder = source.Length & ((Vector128.Count * 
4) - 1); // bit-hack for modulo @@ -249,7 +249,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) + if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) { ref Vector512 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -263,21 +263,21 @@ internal static partial class SimdUtils ref Vector512 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector512 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector512_.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector512_.ShuffleNative(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector512_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control); } } } - else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) + else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) { ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -291,21 +291,21 @@ internal static partial class SimdUtils ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector256_.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector256_.ShuffleNative(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control); } } } - else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat) + else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat) { ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -319,17 +319,17 @@ internal static partial class SimdUtils ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector128 vd0 = ref Unsafe.Add(ref destinationBase, i); - 
vd0 = Vector128_.Shuffle(vs0, control); - Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); - Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); - Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); + vd0 = Vector128_.ShuffleNative(vs0, control); + Unsafe.Add(ref vd0, (nuint)1) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control); + Unsafe.Add(ref vd0, (nuint)2) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control); + Unsafe.Add(ref vd0, (nuint)3) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), control); + Unsafe.Add(ref destinationBase, i) = Vector128_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control); } } } @@ -341,7 +341,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) + if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) { Span temp = stackalloc byte[Vector512.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -359,17 +359,17 @@ internal static partial class SimdUtils ref Vector512 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector512 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector512_.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector512_.ShuffleNative(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector512_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask); } } } @@ -391,21 +391,21 @@ internal static partial class SimdUtils ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector256_.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector256_.ShuffleNative(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask); } } } - else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte) + else if (Vector128.IsHardwareAccelerated 
&& Vector128_.SupportsShuffleNativeByte) { Span temp = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -423,17 +423,17 @@ internal static partial class SimdUtils ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector128 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector128_.Shuffle(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector128_.ShuffleNative(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector128_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask); } } } @@ -445,7 +445,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); @@ -472,15 +472,15 @@ internal static partial class SimdUtils v2 = Vector128_.AlignRight(v2, v1, 8); v1 = Vector128_.AlignRight(v1, v0, 12); - v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16), mask); - v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16), mask); - v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16), mask); - v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16), mask); + v0 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, maskPad4Nx16), mask); + v1 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, maskPad4Nx16), mask); + v2 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, maskPad4Nx16), mask); + v3 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, maskPad4Nx16), mask); - v0 = Vector128_.Shuffle(v0, maskE); - v1 = Vector128_.Shuffle(v1, maskSlice4Nx16); - v2 = Vector128_.Shuffle(v2, maskE); - v3 = Vector128_.Shuffle(v3, maskSlice4Nx16); + v0 = Vector128_.ShuffleNative(v0, maskE); + v1 = Vector128_.ShuffleNative(v1, maskSlice4Nx16); + v2 = Vector128_.ShuffleNative(v2, maskE); + v3 = Vector128_.ShuffleNative(v3, maskSlice4Nx16); v0 = Vector128_.AlignRight(v1, v0, 4); v3 = Vector128_.AlignRight(v3, v2, 12); @@ -505,7 +505,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 fill = Vector128.Create(0xff000000ff000000ul).AsByte(); @@ -534,10 +534,10 @@ internal static partial class SimdUtils ref Vector128 vd = ref Unsafe.Add(ref destinationBase, j); - vd = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 1) = Vector128_.Shuffle(Vector128_.Shuffle(v1, 
maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 2) = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16) | fill, mask); - Unsafe.Add(ref vd, 3) = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16) | fill, mask); + vd = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 1) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 2) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, maskPad4Nx16) | fill, mask); + Unsafe.Add(ref vd, 3) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, maskPad4Nx16) | fill, mask); } } } @@ -548,7 +548,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) { Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); Vector128 maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); @@ -574,10 +574,10 @@ internal static partial class SimdUtils Vector128 v2 = Unsafe.Add(ref vs, 2); Vector128 v3 = Unsafe.Add(ref vs, 3); - v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, mask), maskE); - v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, mask), maskSlice4Nx16); - v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, mask), maskE); - v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, mask), maskSlice4Nx16); + v0 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, mask), maskE); + v1 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, mask), maskSlice4Nx16); + v2 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, mask), maskE); + v3 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, mask), maskSlice4Nx16); v0 = Vector128_.AlignRight(v1, v0, 4); v3 = Vector128_.AlignRight(v3, v2, 12); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 85b09b351..3471acbd3 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -26,7 +26,7 @@ internal static class Vector128_ /// /// Gets a value indicating whether shuffle operations are supported. /// - public static bool SupportsShuffleFloat + public static bool SupportsShuffleNativeFloat { [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Sse.IsSupported; @@ -35,10 +35,10 @@ internal static class Vector128_ /// /// Gets a value indicating whether shuffle operations are supported. /// - public static bool SupportsShuffleByte + public static bool SupportsShuffleNativeByte { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported; + get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported; } /// @@ -66,7 +66,7 @@ internal static class Vector128_ /// The shuffle control byte. /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 Shuffle(Vector128 vector, [ConstantExpected] byte control) + public static Vector128 ShuffleNative(Vector128 vector, [ConstantExpected] byte control) { if (Sse.IsSupported) { @@ -89,7 +89,7 @@ internal static class Vector128_ /// A new vector containing the values from selected by the given . 
/// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 Shuffle(Vector128 vector, Vector128 indices) + public static Vector128 ShuffleNative(Vector128 vector, Vector128 indices) { if (Ssse3.IsSupported) { @@ -101,6 +101,11 @@ internal static class Vector128_ return AdvSimd.Arm64.VectorTableLookup(vector, indices); } + if (PackedSimd.IsSupported) + { + return PackedSimd.Swizzle(vector, indices); + } + ThrowUnreachableException(); return default; } diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 893b6240d..8b22a5137 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -24,10 +24,10 @@ internal static class Vector256_ /// /// Gets a value indicating whether shuffle byte operations are supported. /// - public static bool SupportsShuffleFloat + public static bool SupportsShuffleNativeFloat { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx.IsSupported || Sse.IsSupported; + get => Avx.IsSupported; } /// @@ -46,20 +46,13 @@ internal static class Vector256_ /// The shuffle control byte. /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 Shuffle(Vector256 vector, [ConstantExpected] byte control) + public static Vector256 ShuffleNative(Vector256 vector, [ConstantExpected] byte control) { if (Avx.IsSupported) { return Avx.Shuffle(vector, vector, control); } - if (Sse.IsSupported) - { - Vector128 lower = vector.GetLower(); - Vector128 upper = vector.GetUpper(); - return Vector256.Create(Sse.Shuffle(lower, lower, control), Sse.Shuffle(upper, upper, control)); - } - ThrowUnreachableException(); return default; } @@ -73,7 +66,7 @@ internal static class Vector256_ /// /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 Shuffle(Vector256 vector, Vector256 indices) + public static Vector256 ShuffleNative(Vector256 vector, Vector256 indices) { if (Avx2.IsSupported) { @@ -98,13 +91,6 @@ internal static class Vector256_ return Avx.ConvertToVector256Int32(vector); } - if (Sse2.IsSupported) - { - Vector128 lower = Sse2.ConvertToVector128Int32(vector.GetLower()); - Vector128 upper = Sse2.ConvertToVector128Int32(vector.GetUpper()); - return Vector256.Create(lower, upper); - } - Vector256 sign = vector & Vector256.Create(-0F); Vector256 val_2p23_f32 = sign | Vector256.Create(8388608F); @@ -154,6 +140,27 @@ internal static class Vector256_ return va + (vm0 * vm1); } + /// + /// Packs signed 32-bit integers to signed 16-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 PackSignedSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.PackSignedSaturate(left, right); + } + + Vector256 min = Vector256.Create((int)short.MinValue); + Vector256 max = Vector256.Create((int)short.MaxValue); + Vector256 lefClamped = Clamp(left, min, max); + Vector256 rightClamped = Clamp(right, min, max); + return Vector256.Narrow(lefClamped, rightClamped); + } + /// /// Restricts a vector between a minimum and a maximum value. /// @@ -166,6 +173,21 @@ internal static class Vector256_ public static Vector256 Clamp(Vector256 value, Vector256 min, Vector256 max) => Vector256.Min(Vector256.Max(value, min), max); + /// + /// Widens a to a . + /// + /// The vector to widen. + /// The widened . 
+ public static Vector256 Widen(Vector128 value) + { + if (Avx2.IsSupported) + { + return Avx2.ConvertToVector256Int32(value); + } + + return Vector256.WidenLower(value.ToVector256()); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 3c773bc52..63de5dc10 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -24,16 +24,16 @@ internal static class Vector512_ /// /// Gets a value indicating whether shuffle float operations are supported. /// - public static bool SupportsShuffleFloat + public static bool SupportsShuffleNativeFloat { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx512F.IsSupported || Avx.IsSupported; + get => Avx512F.IsSupported; } /// /// Gets a value indicating whether shuffle byte operations are supported. /// - public static bool SupportsShuffleByte + public static bool SupportsShuffleNativeByte { [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Avx512BW.IsSupported; @@ -46,20 +46,13 @@ internal static class Vector512_ /// The shuffle control byte. /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector512 Shuffle(Vector512 vector, [ConstantExpected] byte control) + public static Vector512 ShuffleNative(Vector512 vector, [ConstantExpected] byte control) { if (Avx512F.IsSupported) { return Avx512F.Shuffle(vector, vector, control); } - if (Avx.IsSupported) - { - Vector256 lower = vector.GetLower(); - Vector256 upper = vector.GetUpper(); - return Vector512.Create(Avx.Shuffle(lower, lower, control), Avx.Shuffle(upper, upper, control)); - } - ThrowUnreachableException(); return default; } @@ -73,7 +66,7 @@ internal static class Vector512_ /// /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector512 Shuffle(Vector512 vector, Vector512 indices) + public static Vector512 ShuffleNative(Vector512 vector, Vector512 indices) { if (Avx512BW.IsSupported) { diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs index 4e4133496..2aaf5c943 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. -using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; @@ -60,109 +59,76 @@ internal partial struct Block8x8F } /// - /// Loads values from using extended AVX2 intrinsics. + /// Loads values from using intrinsics. 
/// /// The source - public void LoadFromInt16ExtendedAvx2(ref Block8x8 source) + public void LoadFromInt16ExtendedVector256(ref Block8x8 source) { DebugGuard.IsTrue( - Avx2.IsSupported, - "LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!"); + Vector256.IsHardwareAccelerated, + "LoadFromInt16ExtendedVector256 only works on Vector256 compatible architecture!"); ref short sRef = ref Unsafe.As(ref source); ref Vector256 dRef = ref Unsafe.As>(ref this); - // Vector256.Count == 16 on AVX2 + // Vector256.Count == 16 // We can process 2 block rows in a single step - Vector256 top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef)); - Vector256 bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256.Count)); - dRef = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 2))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 3))); - Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 4))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 5))); - Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom); - - top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 6))); - bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 7))); - Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top); - Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom); + Vector256 top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef)); + Vector256 bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256.Count)); + dRef = Vector256.ConvertToSingle(top); + Unsafe.Add(ref dRef, 1) = Vector256.ConvertToSingle(bottom); + + top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 2))); + bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 3))); + Unsafe.Add(ref dRef, 2) = Vector256.ConvertToSingle(top); + Unsafe.Add(ref dRef, 3) = Vector256.ConvertToSingle(bottom); + + top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 4))); + bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 5))); + Unsafe.Add(ref dRef, 4) = Vector256.ConvertToSingle(top); + Unsafe.Add(ref dRef, 5) = Vector256.ConvertToSingle(bottom); + + top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 6))); + bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 7))); + Unsafe.Add(ref dRef, 6) = Vector256.ConvertToSingle(top); + Unsafe.Add(ref dRef, 7) = Vector256.ConvertToSingle(bottom); } [MethodImpl(InliningOptions.ShortMethod)] private static Vector256 NormalizeAndRoundVector256(Vector256 value, Vector256 off, Vector256 max) => Vector256_.RoundToNearestInteger(Vector256_.Clamp(value + off, Vector256.Zero, max)); - private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + private static unsafe void MultiplyIntoInt16Vector256(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { - DebugGuard.IsTrue(Avx2.IsSupported, 
"Avx2 support is required to run this operation!"); + DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to run this operation!"); ref Vector256 aBase = ref a.V256_0; ref Vector256 bBase = ref b.V256_0; - ref Vector256 destRef = ref dest.V01; - Vector256 multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); for (nuint i = 0; i < 8; i += 2) { - Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + Vector256 row0 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0)); + Vector256 row1 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1)); - Vector256 row = Avx2.PackSignedSaturate(row0, row1); - row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16(); + Vector256 row = Vector256_.PackSignedSaturate(row0, row1); + row = Vector256.Shuffle(row.AsInt32(), Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7)).AsInt16(); Unsafe.Add(ref destRef, i / 2) = row; } } - private void TransposeInPlace_Avx() + private void TransposeInPlaceVector256() { // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 - Vector256 r0 = Avx.InsertVector128( - this.V256_0, - Unsafe.As>(ref this.V4L), - 1); - - Vector256 r1 = Avx.InsertVector128( - this.V256_1, - Unsafe.As>(ref this.V5L), - 1); - - Vector256 r2 = Avx.InsertVector128( - this.V256_2, - Unsafe.As>(ref this.V6L), - 1); - - Vector256 r3 = Avx.InsertVector128( - this.V256_3, - Unsafe.As>(ref this.V7L), - 1); - - Vector256 r4 = Avx.InsertVector128( - Unsafe.As>(ref this.V0R).ToVector256(), - Unsafe.As>(ref this.V4R), - 1); - - Vector256 r5 = Avx.InsertVector128( - Unsafe.As>(ref this.V1R).ToVector256(), - Unsafe.As>(ref this.V5R), - 1); - - Vector256 r6 = Avx.InsertVector128( - Unsafe.As>(ref this.V2R).ToVector256(), - Unsafe.As>(ref this.V6R), - 1); - - Vector256 r7 = Avx.InsertVector128( - Unsafe.As>(ref this.V3R).ToVector256(), - Unsafe.As>(ref this.V7R), - 1); + Vector256 r0 = this.V256_0.WithUpper(this.V4L.AsVector128()); + Vector256 r1 = this.V256_1.WithUpper(this.V5L.AsVector128()); + Vector256 r2 = this.V256_2.WithUpper(this.V6L.AsVector128()); + Vector256 r3 = this.V256_3.WithUpper(this.V7L.AsVector128()); + Vector256 r4 = this.V0R.AsVector128().ToVector256().WithUpper(this.V4R.AsVector128()); + Vector256 r5 = this.V1R.AsVector128().ToVector256().WithUpper(this.V5R.AsVector128()); + Vector256 r6 = this.V2R.AsVector128().ToVector256().WithUpper(this.V6R.AsVector128()); + Vector256 r7 = this.V3R.AsVector128().ToVector256().WithUpper(this.V7R.AsVector128()); Vector256 t0 = Avx.UnpackLow(r0, r1); Vector256 t2 = Avx.UnpackLow(r2, r3); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 6f9b4fd16..a4a7d3ed0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -277,9 +277,9 @@ internal partial struct Block8x8F : IEquatable /// The quantization table. 
public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { - MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest); + MultiplyIntoInt16Vector256(ref block, ref qt, ref dest); ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest); } else if (Vector128.IsHardwareAccelerated) @@ -387,9 +387,9 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void LoadFrom(ref Block8x8 source) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { - this.LoadFromInt16ExtendedAvx2(ref source); + this.LoadFromInt16ExtendedVector256(ref source); return; } else if (Vector128.IsHardwareAccelerated) @@ -601,9 +601,9 @@ internal partial struct Block8x8F : IEquatable [MethodImpl(InliningOptions.ShortMethod)] public void TransposeInPlace() { - if (Avx.IsSupported) + if (Vector256.IsHardwareAccelerated) { - this.TransposeInPlace_Avx(); + this.TransposeInPlaceVector256(); } else { diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs index 7a8502c2c..25b5e973e 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs @@ -32,7 +32,7 @@ public class Block8x8F_LoadFromInt16 public void Scalar() => this.destination.LoadFromInt16Scalar(ref this.source); [Benchmark] - public void ExtendedAvx2() => this.destination.LoadFromInt16ExtendedAvx2(ref this.source); + public void ExtendedAvx2() => this.destination.LoadFromInt16ExtendedVector256(ref this.source); // RESULT: // Method | Mean | Error | StdDev | Scaled | diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index d1ade761c..ab205c8a3 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -433,7 +433,7 @@ public partial class Block8x8FTests : JpegFixture Block8x8 source = Block8x8.Load(data); Block8x8F dest = default; - dest.LoadFromInt16ExtendedAvx2(ref source); + dest.LoadFromInt16ExtendedVector256(ref source); for (int i = 0; i < Block8x8F.Size; i++) { From 6238f00895c8440624a8be7476eb0d0be01e38fd Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 8 May 2025 21:31:29 +1000 Subject: [PATCH 08/12] Modernize additional V256 code from review --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 23 --- .../Common/Helpers/Vector256Utilities.cs | 22 +++ .../Formats/Jpeg/Components/Block8x8.cs | 4 +- .../Components/FloatingPointDCT.Intrinsic.cs | 142 ------------------ .../Components/FloatingPointDCT.Vector256.cs | 142 ++++++++++++++++++ .../Jpeg/Components/FloatingPointDCT.cs | 14 +- .../Formats/Jpg/Block8x8Tests.cs | 2 +- .../Jpg/Utils/LibJpegTools.ComponentData.cs | 2 +- 8 files changed, 175 insertions(+), 176 deletions(-) delete mode 100644 src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs create mode 100644 src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 8533b2151..e155e4536 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -619,29 +619,6 @@ internal static partial class 
SimdUtils return va + (vm0 * vm1); } - /// - /// Performs a multiplication and a subtraction of the . - /// TODO: Fix. The arguments are in a different order to the FMA intrinsic. - /// - /// ret = (vm0 * vm1) - vs - /// The vector to subtract from the intermediate result. - /// The first vector to multiply. - /// The second vector to multiply. - /// The . - [MethodImpl(InliningOptions.ShortMethod)] - public static Vector256 MultiplySubtract( - Vector256 vs, - Vector256 vm0, - Vector256 vm1) - { - if (Fma.IsSupported) - { - return Fma.MultiplySubtract(vm1, vm0, vs); - } - - return Avx.Subtract(Avx.Multiply(vm0, vm1), vs); - } - /// /// Performs a multiplication and a negated addition of the . /// diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 8b22a5137..c835d267d 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -140,6 +140,28 @@ internal static class Vector256_ return va + (vm0 * vm1); } + /// + /// Performs a multiplication and a subtraction of the . + /// + /// ret = (vm0 * vm1) - vs + /// The vector to subtract from the intermediate result. + /// The first vector to multiply. + /// The second vector to multiply. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplySubtract( + Vector256 vs, + Vector256 vm0, + Vector256 vm1) + { + if (Fma.IsSupported) + { + return Fma.MultiplySubtract(vm1, vm0, vs); + } + + return (vm0 * vm1) - vs; + } + /// /// Packs signed 32-bit integers to signed 16-bit integers and saturates. /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index 01d112bd6..731ad0f76 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -211,10 +211,10 @@ internal partial struct Block8x8 } /// - /// Transpose the block inplace. + /// Transpose the block in place. /// [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInplace() + public void TransposeInPlace() { ref short elemRef = ref Unsafe.As(ref this); diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs deleted file mode 100644 index 862c77469..000000000 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. - -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; - -namespace SixLabors.ImageSharp.Formats.Jpeg.Components; - -internal static partial class FloatingPointDCT -{ - /// - /// Apply floating point FDCT inplace using simd operations. - /// - /// Input block. 
- private static void FDCT8x8_Avx(ref Block8x8F block) - { - DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); - - // First pass - process columns - FDCT8x8_1D_Avx(ref block); - - // Second pass - process rows - block.TransposeInPlace(); - FDCT8x8_1D_Avx(ref block); - - // Applies 1D floating point FDCT inplace - static void FDCT8x8_1D_Avx(ref Block8x8F block) - { - Vector256 tmp0 = Avx.Add(block.V256_0, block.V256_7); - Vector256 tmp7 = Avx.Subtract(block.V256_0, block.V256_7); - Vector256 tmp1 = Avx.Add(block.V256_1, block.V256_6); - Vector256 tmp6 = Avx.Subtract(block.V256_1, block.V256_6); - Vector256 tmp2 = Avx.Add(block.V256_2, block.V256_5); - Vector256 tmp5 = Avx.Subtract(block.V256_2, block.V256_5); - Vector256 tmp3 = Avx.Add(block.V256_3, block.V256_4); - Vector256 tmp4 = Avx.Subtract(block.V256_3, block.V256_4); - - // Even part - Vector256 tmp10 = Avx.Add(tmp0, tmp3); - Vector256 tmp13 = Avx.Subtract(tmp0, tmp3); - Vector256 tmp11 = Avx.Add(tmp1, tmp2); - Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); - - block.V256_0 = Avx.Add(tmp10, tmp11); - block.V256_4 = Avx.Subtract(tmp10, tmp11); - - var mm256_F_0_7071 = Vector256.Create(0.707106781f); - Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); - block.V256_2 = Avx.Add(tmp13, z1); - block.V256_6 = Avx.Subtract(tmp13, z1); - - // Odd part - tmp10 = Avx.Add(tmp4, tmp5); - tmp11 = Avx.Add(tmp5, tmp6); - tmp12 = Avx.Add(tmp6, tmp7); - - Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), Vector256.Create(0.382683433f)); // mm256_F_0_3826 - Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411 - Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065 - Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); - - Vector256 z11 = Avx.Add(tmp7, z3); - Vector256 z13 = Avx.Subtract(tmp7, z3); - - block.V256_5 = Avx.Add(z13, z2); - block.V256_3 = Avx.Subtract(z13, z2); - block.V256_1 = Avx.Add(z11, z4); - block.V256_7 = Avx.Subtract(z11, z4); - } - } - - /// - /// Apply floating point IDCT inplace using simd operations. - /// - /// Transposed input block. 
- private static void IDCT8x8_Avx(ref Block8x8F transposedBlock) - { - DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); - - // First pass - process columns - IDCT8x8_1D_Avx(ref transposedBlock); - - // Second pass - process rows - transposedBlock.TransposeInPlace(); - IDCT8x8_1D_Avx(ref transposedBlock); - - // Applies 1D floating point FDCT inplace - static void IDCT8x8_1D_Avx(ref Block8x8F block) - { - // Even part - Vector256 tmp0 = block.V256_0; - Vector256 tmp1 = block.V256_2; - Vector256 tmp2 = block.V256_4; - Vector256 tmp3 = block.V256_6; - - Vector256 z5 = tmp0; - Vector256 tmp10 = Avx.Add(z5, tmp2); - Vector256 tmp11 = Avx.Subtract(z5, tmp2); - - var mm256_F_1_4142 = Vector256.Create(1.414213562f); - Vector256 tmp13 = Avx.Add(tmp1, tmp3); - Vector256 tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142); - - tmp0 = Avx.Add(tmp10, tmp13); - tmp3 = Avx.Subtract(tmp10, tmp13); - tmp1 = Avx.Add(tmp11, tmp12); - tmp2 = Avx.Subtract(tmp11, tmp12); - - // Odd part - Vector256 tmp4 = block.V256_1; - Vector256 tmp5 = block.V256_3; - Vector256 tmp6 = block.V256_5; - Vector256 tmp7 = block.V256_7; - - Vector256 z13 = Avx.Add(tmp6, tmp5); - Vector256 z10 = Avx.Subtract(tmp6, tmp5); - Vector256 z11 = Avx.Add(tmp4, tmp7); - Vector256 z12 = Avx.Subtract(tmp4, tmp7); - - tmp7 = Avx.Add(z11, z13); - tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142); - - z5 = Avx.Multiply(Avx.Add(z10, z12), Vector256.Create(1.847759065f)); // mm256_F_1_8477 - - tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823 - tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131 - - tmp6 = Avx.Subtract(tmp12, tmp7); - tmp5 = Avx.Subtract(tmp11, tmp6); - tmp4 = Avx.Subtract(tmp10, tmp5); - - block.V256_0 = Avx.Add(tmp0, tmp7); - block.V256_7 = Avx.Subtract(tmp0, tmp7); - block.V256_1 = Avx.Add(tmp1, tmp6); - block.V256_6 = Avx.Subtract(tmp1, tmp6); - block.V256_2 = Avx.Add(tmp2, tmp5); - block.V256_5 = Avx.Subtract(tmp2, tmp5); - block.V256_3 = Avx.Add(tmp3, tmp4); - block.V256_4 = Avx.Subtract(tmp3, tmp4); - } - } -} diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs new file mode 100644 index 000000000..bcd8c7043 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs @@ -0,0 +1,142 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Runtime.Intrinsics; +using SixLabors.ImageSharp.Common.Helpers; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal static partial class FloatingPointDCT +{ + /// + /// Apply floating point FDCT in place using simd operations. + /// + /// Input block. 
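+    /// <remarks>
+    /// The factorization and constants used below (0.707106781, 0.541196100, 1.306562965,
+    /// 0.382683433) follow the scaled AAN (Arai, Agui, Nakajima) float FDCT, as in
+    /// libjpeg's jfdctflt.c; only the operations are expressed with portable Vector256 operators.
+    /// </remarks>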
+ private static void FDCT8x8_Vector256(ref Block8x8F block) + { + DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation."); + + // First pass - process columns + FDCT8x8_1D_Vector256(ref block); + + // Second pass - process rows + block.TransposeInPlace(); + FDCT8x8_1D_Vector256(ref block); + + // Applies 1D floating point FDCT in place + static void FDCT8x8_1D_Vector256(ref Block8x8F block) + { + Vector256 tmp0 = block.V256_0 + block.V256_7; + Vector256 tmp7 = block.V256_0 - block.V256_7; + Vector256 tmp1 = block.V256_1 + block.V256_6; + Vector256 tmp6 = block.V256_1 - block.V256_6; + Vector256 tmp2 = block.V256_2 + block.V256_5; + Vector256 tmp5 = block.V256_2 - block.V256_5; + Vector256 tmp3 = block.V256_3 + block.V256_4; + Vector256 tmp4 = block.V256_3 - block.V256_4; + + // Even part + Vector256 tmp10 = tmp0 + tmp3; + Vector256 tmp13 = tmp0 - tmp3; + Vector256 tmp11 = tmp1 + tmp2; + Vector256 tmp12 = tmp1 - tmp2; + + block.V256_0 = tmp10 + tmp11; + block.V256_4 = tmp10 - tmp11; + + Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); + Vector256 z1 = (tmp12 + tmp13) * mm256_F_0_7071; + block.V256_2 = tmp13 + z1; + block.V256_6 = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + Vector256 z5 = (tmp10 - tmp12) * Vector256.Create(0.382683433f); // mm256_F_0_3826 + Vector256 z2 = Vector256_.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411 + Vector256 z4 = Vector256_.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065 + Vector256 z3 = tmp11 * mm256_F_0_7071; + + Vector256 z11 = tmp7 + z3; + Vector256 z13 = tmp7 - z3; + + block.V256_5 = z13 + z2; + block.V256_3 = z13 - z2; + block.V256_1 = z11 + z4; + block.V256_7 = z11 - z4; + } + } + + /// + /// Apply floating point IDCT in place using simd operations. + /// + /// Transposed input block. 
+ private static void IDCT8x8_Vector256(ref Block8x8F transposedBlock) + { + DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation."); + + // First pass - process columns + IDCT8x8_1D_Vector256(ref transposedBlock); + + // Second pass - process rows + transposedBlock.TransposeInPlace(); + IDCT8x8_1D_Vector256(ref transposedBlock); + + // Applies 1D floating point FDCT in place + static void IDCT8x8_1D_Vector256(ref Block8x8F block) + { + // Even part + Vector256 tmp0 = block.V256_0; + Vector256 tmp1 = block.V256_2; + Vector256 tmp2 = block.V256_4; + Vector256 tmp3 = block.V256_6; + + Vector256 z5 = tmp0; + Vector256 tmp10 = z5 + tmp2; + Vector256 tmp11 = z5 - tmp2; + + Vector256 mm256_F_1_4142 = Vector256.Create(1.414213562f); + Vector256 tmp13 = tmp1 + tmp3; + Vector256 tmp12 = Vector256_.MultiplySubtract(tmp13, tmp1 - tmp3, mm256_F_1_4142); + + tmp0 = tmp10 + tmp13; + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + // Odd part + Vector256 tmp4 = block.V256_1; + Vector256 tmp5 = block.V256_3; + Vector256 tmp6 = block.V256_5; + Vector256 tmp7 = block.V256_7; + + Vector256 z13 = tmp6 + tmp5; + Vector256 z10 = tmp6 - tmp5; + Vector256 z11 = tmp4 + tmp7; + Vector256 z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; + tmp11 = (z11 - z13) * mm256_F_1_4142; + + z5 = (z10 + z12) * Vector256.Create(1.847759065f); // mm256_F_1_8477 + + tmp10 = Vector256_.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823 + tmp12 = Vector256_.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131 + + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 - tmp5; + + block.V256_0 = tmp0 + tmp7; + block.V256_7 = tmp0 - tmp7; + block.V256_1 = tmp1 + tmp6; + block.V256_6 = tmp1 - tmp6; + block.V256_2 = tmp2 + tmp5; + block.V256_5 = tmp2 - tmp5; + block.V256_3 = tmp3 + tmp4; + block.V256_4 = tmp3 - tmp4; + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs index 4c22307cf..8122d8daa 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs @@ -4,7 +4,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Jpeg.Components; @@ -101,7 +101,7 @@ internal static partial class FloatingPointDCT } /// - /// Apply 2D floating point IDCT inplace. + /// Apply 2D floating point IDCT in place. /// /// /// Input block must be dequantized with quantization table @@ -110,9 +110,9 @@ internal static partial class FloatingPointDCT /// Input block. public static void TransformIDCT(ref Block8x8F block) { - if (Avx.IsSupported) + if (Vector256.IsHardwareAccelerated) { - IDCT8x8_Avx(ref block); + IDCT8x8_Vector256(ref block); } else { @@ -121,7 +121,7 @@ internal static partial class FloatingPointDCT } /// - /// Apply 2D floating point IDCT inplace. + /// Apply 2D floating point IDCT in place. /// /// /// Input block must be quantized after this method with quantization @@ -130,9 +130,9 @@ internal static partial class FloatingPointDCT /// Input block. 
public static void TransformFDCT(ref Block8x8F block) { - if (Avx.IsSupported) + if (Vector256.IsHardwareAccelerated) { - FDCT8x8_Avx(ref block); + FDCT8x8_Vector256(ref block); } else { diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs index b5d364dd3..cb8f52a96 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs @@ -271,7 +271,7 @@ public class Block8x8Tests : JpegFixture Block8x8 block8x8 = Block8x8.Load(Create8x8ShortData()); - block8x8.TransposeInplace(); + block8x8.TransposeInPlace(); short[] actual = new short[64]; block8x8.CopyTo(actual); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs index 65d0a01ff..975378b5f 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs @@ -60,7 +60,7 @@ internal static partial class LibJpegTools internal void MakeBlock(Block8x8 block, int y, int x) { - block.TransposeInplace(); + block.TransposeInPlace(); this.MakeBlock(block.ToArray(), y, x); } From 505ecce3fa8d5c7cf8f967c7dd27f1f2f831fab0 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 12 May 2025 09:46:17 +1000 Subject: [PATCH 09/12] Update ShuffleNative (byte) --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 20 +++++++---- .../Common/Helpers/Vector128Utilities.cs | 35 +++++++++++-------- .../Common/Helpers/Vector256Utilities.cs | 2 +- .../Formats/Jpeg/Components/Block8x8F.cs | 1 - 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index e155e4536..dc610a6f9 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -113,7 +113,7 @@ internal static partial class SimdUtils [ConstantExpected] byte control) { if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) || - (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) || + (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) || (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)) { int remainder = 0; @@ -158,7 +158,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight) { int remainder = source.Length % (Vector128.Count * 3); @@ -373,7 +373,7 @@ internal static partial class SimdUtils } } } - else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) + else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) { Span temp = stackalloc byte[Vector256.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -445,7 +445,9 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign) + if (Vector128.IsHardwareAccelerated && + Vector128_.SupportsShuffleNativeByte && + Vector128_.SupportsAlignRight) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 
maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); @@ -505,7 +507,10 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && + Vector128_.SupportsShuffleNativeByte && + Vector128_.SupportsShiftByte && + Vector128_.SupportsAlignRight) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 fill = Vector128.Create(0xff000000ff000000ul).AsByte(); @@ -548,7 +553,10 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated && + Vector128_.SupportsShuffleNativeByte && + Vector128_.SupportsShiftByte && + Vector128_.SupportsAlignRight) { Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); Vector128 maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 3471acbd3..83b842e13 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -4,6 +4,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.Wasm; @@ -38,13 +39,26 @@ internal static class Vector128_ public static bool SupportsShuffleNativeByte { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported; + get + { + if (Vector128.IsHardwareAccelerated) + { + if (RuntimeInformation.ProcessArchitecture is Architecture.X86 or Architecture.X64) + { + return Ssse3.IsSupported; + } + + return true; + } + + return false; + } } /// /// Gets a value indicating whether right align operations are supported. /// - public static bool SupportsRightAlign + public static bool SupportsAlignRight { [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Ssse3.IsSupported || AdvSimd.IsSupported; @@ -91,23 +105,16 @@ internal static class Vector128_ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 ShuffleNative(Vector128 vector, Vector128 indices) { + // For x64 we use the SSSE3 shuffle intrinsic to avoid additional instructions. 3 vs 1. if (Ssse3.IsSupported) { return Ssse3.Shuffle(vector, indices); } - if (AdvSimd.Arm64.IsSupported) - { - return AdvSimd.Arm64.VectorTableLookup(vector, indices); - } - - if (PackedSimd.IsSupported) - { - return PackedSimd.Swizzle(vector, indices); - } - - ThrowUnreachableException(); - return default; + // For ARM and WASM, codegen will be optimal. + // We don't throw for x86/x64 so we should never use this method without + // checking for support. + return Vector128.Shuffle(vector, indices); } /// diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index c835d267d..817d6e607 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -33,7 +33,7 @@ internal static class Vector256_ /// /// Gets a value indicating whether shuffle byte operations are supported. 
/// - public static bool SupportsShuffleByte + public static bool SupportsShuffleNativeByte { [MethodImpl(MethodImplOptions.AggressiveInlining)] get => Avx2.IsSupported; diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index a4a7d3ed0..49b519201 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -5,7 +5,6 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; using System.Text; using SixLabors.ImageSharp.Common.Helpers; From 55a8c732326b19416d58323dcb3660dea10b0687 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 12 May 2025 10:47:34 +1000 Subject: [PATCH 10/12] Expand v128 native shuffle (float) support --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 4 ++-- .../Common/Helpers/Vector128Utilities.cs | 19 ++++++++----------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index dc610a6f9..a1bf7dad3 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -68,7 +68,7 @@ internal static partial class SimdUtils { if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) || (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) || - (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat)) + Vector128.IsHardwareAccelerated) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -305,7 +305,7 @@ internal static partial class SimdUtils } } } - else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat) + else if (Vector128.IsHardwareAccelerated) { ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 83b842e13..322423e1a 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -24,15 +24,6 @@ namespace SixLabors.ImageSharp.Common.Helpers; internal static class Vector128_ #pragma warning restore SA1649 // File name should match first type name { - /// - /// Gets a value indicating whether shuffle operations are supported. - /// - public static bool SupportsShuffleNativeFloat - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Sse.IsSupported; - } - /// /// Gets a value indicating whether shuffle operations are supported. /// @@ -87,8 +78,14 @@ internal static class Vector128_ return Sse.Shuffle(vector, vector, control); } - ThrowUnreachableException(); - return default; + // Don't use InverseMMShuffle here as we want to avoid the cast. 
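+        // The control byte packs four 2-bit lane indices (the classic _MM_SHUFFLE layout):
+        // bits [1:0] select the source element for lane 0, bits [3:2] for lane 1, and so on,
+        // so it expands directly into an index vector for the portable Vector128.Shuffle fallback.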
+ Vector128 indices = Vector128.Create( + control & 0x3, + (control >> 2) & 0x3, + (control >> 4) & 0x3, + (control >> 6) & 0x3); + + return Vector128.Shuffle(vector, indices); } /// From a59c900e9f7743b50c7d9721f09e55faa6d66186 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 14 May 2025 20:55:57 +1000 Subject: [PATCH 11/12] More optimizations based on feedback --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 13 ++++++++++--- src/ImageSharp/Common/Helpers/Vector128Utilities.cs | 10 ++++++++++ .../Formats/Jpeg/Components/Block8x8F.Vector128.cs | 1 - 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index a1bf7dad3..4911653ce 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -1008,6 +1008,8 @@ internal static partial class SimdUtils ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); Vector128 scale = Vector128.Create((float)byte.MaxValue); + Vector128 min = Vector128.Zero; + Vector128 max = Vector128.Create((int)byte.MaxValue); for (nuint i = 0; i < n; i++) { @@ -1023,10 +1025,15 @@ internal static partial class SimdUtils Vector128 w2 = Vector128_.ConvertToInt32RoundToEven(f2); Vector128 w3 = Vector128_.ConvertToInt32RoundToEven(f3); - Vector128 u0 = Vector128_.PackSignedSaturate(w0, w1); - Vector128 u1 = Vector128_.PackSignedSaturate(w2, w3); + w0 = Vector128_.Clamp(w0, min, max); + w1 = Vector128_.Clamp(w1, min, max); + w2 = Vector128_.Clamp(w2, min, max); + w3 = Vector128_.Clamp(w3, min, max); - Unsafe.Add(ref destinationBase, i) = Vector128_.PackUnsignedSaturate(u0, u1); + Vector128 u0 = Vector128.Narrow(w0, w1); + Vector128 u1 = Vector128.Narrow(w2, w3); + + Unsafe.Add(ref destinationBase, i) = Vector128.Narrow(u0, u1).AsByte(); } } } diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 322423e1a..dbe0a1fce 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -205,6 +205,11 @@ internal static class Vector128_ return AdvSimd.ConvertToInt32RoundToEven(vector); } + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertToInt32Saturate(PackedSimd.RoundToNearest(vector)); + } + Vector128 sign = vector & Vector128.Create(-0F); Vector128 val_2p23_f32 = sign | Vector128.Create(8388608F); @@ -230,6 +235,11 @@ internal static class Vector128_ return AdvSimd.RoundToNearest(vector); } + if (PackedSimd.IsSupported) + { + return PackedSimd.RoundToNearest(vector); + } + Vector128 sign = vector & Vector128.Create(-0F); Vector128 val_2p23_f32 = sign | Vector128.Create(8388608F); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs index 3daa47693..d4c0398d9 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. -using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Common.Helpers; From 4c1ecfad49a0b76e1ad46a2a99222305e4096cd5 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 15 May 2025 09:16:43 +1000 Subject: [PATCH 12/12] Fix v128 narrowing. 
--- src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 4911653ce..96ddb7976 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -1030,10 +1030,10 @@ internal static partial class SimdUtils w2 = Vector128_.Clamp(w2, min, max); w3 = Vector128_.Clamp(w3, min, max); - Vector128 u0 = Vector128.Narrow(w0, w1); - Vector128 u1 = Vector128.Narrow(w2, w3); + Vector128 u0 = Vector128.Narrow(w0, w1).AsUInt16(); + Vector128 u1 = Vector128.Narrow(w2, w3).AsUInt16(); - Unsafe.Add(ref destinationBase, i) = Vector128.Narrow(u0, u1).AsByte(); + Unsafe.Add(ref destinationBase, i) = Vector128.Narrow(u0, u1); } } }
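Note on the final two patches above (the clamp-and-narrow change and the narrowing fix): the removed PackSignedSaturate/PackUnsignedSaturate pair clamped implicitly through x86 saturation, whereas the cross-platform Vector128.Narrow truncates, so the values are now clamped to [0, 255] explicitly before narrowing, and the intermediate Vector128<short> produced by Narrow(Vector128<int>, Vector128<int>) is reinterpreted as ushort so that the second Narrow yields Vector128<byte> directly. The sketch below is for illustration only: it uses public System.Runtime.Intrinsics APIs with plain Min/Max in place of the internal Vector128_.Clamp helper, and the NarrowingSketch/PackToBytes names are invented for the example.

using System.Runtime.Intrinsics;

internal static class NarrowingSketch
{
    // Packs four vectors of already-rounded 32-bit values into one vector of bytes.
    // Clamp to [0, 255] first because Vector128.Narrow truncates rather than saturates;
    // after clamping, the short and ushort bit patterns agree, so AsUInt16 is a free
    // reinterpretation and the final Narrow produces the desired byte lanes.
    public static Vector128<byte> PackToBytes(
        Vector128<int> w0, Vector128<int> w1, Vector128<int> w2, Vector128<int> w3)
    {
        Vector128<int> min = Vector128<int>.Zero;
        Vector128<int> max = Vector128.Create((int)byte.MaxValue);

        w0 = Vector128.Min(Vector128.Max(w0, min), max);
        w1 = Vector128.Min(Vector128.Max(w1, min), max);
        w2 = Vector128.Min(Vector128.Max(w2, min), max);
        w3 = Vector128.Min(Vector128.Max(w3, min), max);

        Vector128<ushort> u0 = Vector128.Narrow(w0, w1).AsUInt16();
        Vector128<ushort> u1 = Vector128.Narrow(w2, w3).AsUInt16();
        return Vector128.Narrow(u0, u1);
    }
}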