diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index a51c21b37f..c5a7f5e909 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -16,6 +16,29 @@ namespace SixLabors.ImageSharp { public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + /// + /// Performs a multiplication and an addition of the . + /// + /// The vector to add to the intermediate result. + /// The first vector to multiply. + /// The second vector to multiply. + /// The . + [MethodImpl(InliningOptions.ShortMethod)] + public static Vector256 MultiplyAdd( + in Vector256 va, + in Vector256 vm0, + in Vector256 vm1) + { + if (Fma.IsSupported) + { + return Fma.MultiplyAdd(vm1, vm0, va); + } + else + { + return Avx.Add(Avx.Multiply(vm0, vm1), va); + } + } + /// /// as many elements as possible, slicing them down (keeping the remainder). /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index c4d1408a2e..8c34baa1dc 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -1,11 +1,15 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. using System; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; - +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using static SixLabors.ImageSharp.SimdUtils; +#endif using SixLabors.ImageSharp.Tuples; // ReSharper disable ImpureMethodCallOnReadonlyValueField @@ -47,6 +51,71 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters "JpegColorConverter.FromYCbCrSimd256 can be used only on architecture having 256 byte floating point SIMD registers!"); } +#if SUPPORTS_RUNTIME_INTRINSICS + ref Vector256 yBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector256 cbBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector256 crBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + + ref Vector4Octet resultBase = + ref Unsafe.As(ref MemoryMarshal.GetReference(result)); + + // Used for the color conversion + var chromaOffset = Vector256.Create(-halfValue); + var scale = Vector256.Create(1 / maxValue); + var rCrMult = Vector256.Create(1.402F); + var gCbMult = Vector256.Create(0.344136F); + var gCrMult = Vector256.Create(0.714136F); + var bCbMult = Vector256.Create(1.772F); + + // Used for packing. + Vector4 vo = Vector4.One; + Vector128 valpha = Unsafe.As>(ref vo); + ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskDeinterleave8x32); + Vector256 vcontrol = Unsafe.As>(ref control); + + Vector4Pair rr = default; + Vector4Pair gg = default; + Vector4Pair bb = default; + + ref Vector256 rrRefAsVector = ref Unsafe.As>(ref rr); + ref Vector256 ggRefAsVector = ref Unsafe.As>(ref gg); + ref Vector256 bbRefAsVector = ref Unsafe.As>(ref bb); + + // Walking 8 elements at one step: + int n = result.Length / 8; + for (int i = 0; i < n; i++) + { + // y = yVals[i]; + // cb = cbVals[i] - 128F; + // cr = crVals[i] - 128F; + Vector256 y = Unsafe.Add(ref yBase, i); + Vector256 cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset); + Vector256 cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset); + + // r = y + (1.402F * cr); + // g = y - (0.344136F * cb) - (0.714136F * cr); + // b = y + (1.772F * cb); + // Adding & multiplying 8 elements at one time: + Vector256 r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult); + Vector256 g = Avx.Subtract(Avx.Subtract(y, Avx.Multiply(cb, gCbMult)), Avx.Multiply(cr, gCrMult)); + Vector256 b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult); + + r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale); + g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale); + b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale); + + rrRefAsVector = r; + ggRefAsVector = g; + bbRefAsVector = b; + + // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order: + ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); + destination.PackAvx2(ref rr, ref gg, ref bb, in valpha, in vcontrol); + } +#else ref Vector yBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); ref Vector cbBase = @@ -104,6 +173,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); destination.Pack(ref rr, ref gg, ref bb); } +#endif } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index f2a1c1e91e..4e96f3471d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -190,95 +190,97 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters #pragma warning disable SA1132 // Do not combine fields public Vector4 V0, V1, V2, V3, V4, V5, V6, V7; +#if SUPPORTS_RUNTIME_INTRINSICS + /// /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... /// - public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) + [MethodImpl(InliningOptions.ShortMethod)] + public void PackAvx2( + ref Vector4Pair r, + ref Vector4Pair g, + ref Vector4Pair b, + in Vector128 a, + in Vector256 vcontrol) { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) - { - Vector4 vo = Vector4.One; - Vector128 valpha = Unsafe.As>(ref vo); + Vector256 r0 = Avx.InsertVector128( + Unsafe.As>(ref r.A), + Unsafe.As>(ref g.A), + 1); - ref byte control = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskDeinterleave8x32); - Vector256 vcontrol = Unsafe.As>(ref control); + Vector256 r1 = Avx.InsertVector128( + Unsafe.As>(ref b.A), + a, + 1); - Vector256 r0 = Avx.InsertVector128( - Unsafe.As>(ref r.A).ToVector256(), - Unsafe.As>(ref g.A), - 1); + Vector256 r2 = Avx.InsertVector128( + Unsafe.As>(ref r.B).ToVector256(), + Unsafe.As>(ref g.B), + 1); - Vector256 r1 = Avx.InsertVector128( - Unsafe.As>(ref b.A).ToVector256(), - valpha, - 1); + Vector256 r3 = Avx.InsertVector128( + Unsafe.As>(ref b.B).ToVector256(), + a, + 1); - Vector256 r2 = Avx.InsertVector128( - Unsafe.As>(ref r.B).ToVector256(), - Unsafe.As>(ref g.B), - 1); + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackHigh(r0, r1); - Vector256 r3 = Avx.InsertVector128( - Unsafe.As>(ref b.B).ToVector256(), - valpha, - 1); + Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); + Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackHigh(r0, r1); + Vector256 t4 = Avx.UnpackLow(r2, r3); + Vector256 t6 = Avx.UnpackHigh(r2, r3); - Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); - Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); - - Vector256 t4 = Avx.UnpackLow(r2, r3); - Vector256 t6 = Avx.UnpackHigh(r2, r3); - - Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); - Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); - } - else + Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); + Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); + } #endif - { - this.V0.X = r.A.X; - this.V0.Y = g.A.X; - this.V0.Z = b.A.X; - this.V0.W = 1f; - - this.V1.X = r.A.Y; - this.V1.Y = g.A.Y; - this.V1.Z = b.A.Y; - this.V1.W = 1f; - - this.V2.X = r.A.Z; - this.V2.Y = g.A.Z; - this.V2.Z = b.A.Z; - this.V2.W = 1f; - - this.V3.X = r.A.W; - this.V3.Y = g.A.W; - this.V3.Z = b.A.W; - this.V3.W = 1f; - - this.V4.X = r.B.X; - this.V4.Y = g.B.X; - this.V4.Z = b.B.X; - this.V4.W = 1f; - - this.V5.X = r.B.Y; - this.V5.Y = g.B.Y; - this.V5.Z = b.B.Y; - this.V5.W = 1f; - - this.V6.X = r.B.Z; - this.V6.Y = g.B.Z; - this.V6.Z = b.B.Z; - this.V6.W = 1f; - - this.V7.X = r.B.W; - this.V7.Y = g.B.W; - this.V7.Z = b.B.W; - this.V7.W = 1f; - } + + /// + /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... + /// + public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) + { + this.V0.X = r.A.X; + this.V0.Y = g.A.X; + this.V0.Z = b.A.X; + this.V0.W = 1f; + + this.V1.X = r.A.Y; + this.V1.Y = g.A.Y; + this.V1.Z = b.A.Y; + this.V1.W = 1f; + + this.V2.X = r.A.Z; + this.V2.Y = g.A.Z; + this.V2.Z = b.A.Z; + this.V2.W = 1f; + + this.V3.X = r.A.W; + this.V3.Y = g.A.W; + this.V3.Z = b.A.W; + this.V3.W = 1f; + + this.V4.X = r.B.X; + this.V4.Y = g.B.X; + this.V4.Z = b.B.X; + this.V4.W = 1f; + + this.V5.X = r.B.Y; + this.V5.Y = g.B.Y; + this.V5.Z = b.B.Y; + this.V5.W = 1f; + + this.V6.X = r.B.Z; + this.V6.Y = g.B.Z; + this.V6.Z = b.B.Z; + this.V6.W = 1f; + + this.V7.X = r.B.W; + this.V7.Y = g.B.W; + this.V7.Z = b.B.W; + this.V7.W = 1f; } } } diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs deleted file mode 100644 index a7ea771988..0000000000 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Apache License, Version 2.0. - -using System.Numerics; -using BenchmarkDotNet.Attributes; -using SixLabors.ImageSharp.Tuples; -using static SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters.JpegColorConverter; - -namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg -{ - [Config(typeof(Config.HwIntrinsics_SSE_AVX))] - public class Vector4OctetPack - { - private static Vector4Pair r = new Vector4Pair - { - A = new Vector4(1, 2, 3, 4), - B = new Vector4(5, 6, 7, 8) - }; - - private static Vector4Pair g = new Vector4Pair - { - A = new Vector4(9, 10, 11, 12), - B = new Vector4(13, 14, 15, 16) - }; - - private static Vector4Pair b = new Vector4Pair - { - A = new Vector4(17, 18, 19, 20), - B = new Vector4(21, 22, 23, 24) - }; - - [Benchmark] - public void Pack() - { - Vector4Octet v = default; - - v.Pack(ref r, ref g, ref b); - } - } -}