// Patch summary: adds an AVX2 fast path to Pack (called on a Vector4Octet in the new benchmark), makes the PermuteMaskDeinterleave8x32 table public so that path can read it, adds a Vector4OctetPack benchmark, and adds FeatureSIMD=Off to the "No HwIntrinsics" benchmark job.
// NOTE(review): this dump is collapsed onto a few lines, and generic type arguments / XML doc tags look stripped (e.g. "Unsafe.As>(ref vo)"); the patch text below is left exactly as found, only block comments were added.
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 2fe2f99ac6..a51c21b37f 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -14,7 +14,7 @@ namespace SixLabors.ImageSharp { public static class HwIntrinsics { - private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; /* Made public because Pack in JpegColorConverter.cs reads it via SimdUtils.HwIntrinsics. Each 4-byte group is one 32-bit index when read little-endian: 0, 4, 1, 5, 2, 6, 3, 7. */ /// /// as many elements as possible, slicing them down (keeping the remainder). diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index f68bca0412..f2a1c1e91e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -4,7 +4,12 @@ using System; using System.Collections.Generic; using System.Numerics; - +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.Tuples; @@ -190,45 +195,90 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters /// public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) { - this.V0.X = r.A.X; - this.V0.Y = g.A.X; - this.V0.Z = b.A.X; - this.V0.W = 1f; - - this.V1.X = r.A.Y; - this.V1.Y = g.A.Y; - this.V1.Z = b.A.Y; - this.V1.W = 1f; - - this.V2.X = r.A.Z; - this.V2.Y = g.A.Z; - this.V2.Z = b.A.Z; - this.V2.W = 1f; - - this.V3.X =
r.A.W; - this.V3.Y = g.A.W; - this.V3.Z = b.A.W; - this.V3.W = 1f; - - this.V4.X = r.B.X; - this.V4.Y = g.B.X; - this.V4.Z = b.B.X; - this.V4.W = 1f; - - this.V5.X = r.B.Y; - this.V5.Y = g.B.Y; - this.V5.Z = b.B.Y; - this.V5.W = 1f; - - this.V6.X = r.B.Z; - this.V6.Y = g.B.Z; - this.V6.Z = b.B.Z; - this.V6.W = 1f; - - this.V7.X = r.B.W; - this.V7.Y = g.B.W; - this.V7.Z = b.B.W; - this.V7.W = 1f; +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { /* AVX2 path: produces all eight output vectors with four 256-bit stores instead of the 32 scalar field writes in the fallback. */ + Vector4 vo = Vector4.One; + Vector128 valpha = Unsafe.As>(ref vo); /* Vector4.One reinterpreted as a 128-bit vector: the 1f lanes that end up in every W component. */ + + ref byte control = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskDeinterleave8x32); + Vector256 vcontrol = Unsafe.As>(ref control); /* permute indices, read in place starting at the mask table's first byte */ + + Vector256 r0 = Avx.InsertVector128( + Unsafe.As>(ref r.A).ToVector256(), + Unsafe.As>(ref g.A), + 1); /* r0 = [ r.A | g.A ] */ + + Vector256 r1 = Avx.InsertVector128( + Unsafe.As>(ref b.A).ToVector256(), + valpha, + 1); /* r1 = [ b.A | 1 1 1 1 ] */ + + Vector256 r2 = Avx.InsertVector128( + Unsafe.As>(ref r.B).ToVector256(), + Unsafe.As>(ref g.B), + 1); /* r2 = [ r.B | g.B ] */ + + Vector256 r3 = Avx.InsertVector128( + Unsafe.As>(ref b.B).ToVector256(), + valpha, + 1); /* r3 = [ b.B | 1 1 1 1 ] */ + + Vector256 t0 = Avx.UnpackLow(r0, r1); /* interleaves within each 128-bit half: [ rAx bAx rAy bAy | gAx 1 gAy 1 ] */ + Vector256 t2 = Avx.UnpackHigh(r0, r1); /* [ rAz bAz rAw bAw | gAz 1 gAw 1 ] */ + + Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); /* indices 0,4,1,5,2,6,3,7 give [ rAx gAx bAx 1 | rAy gAy bAy 1 ]; the 256-bit write covers V0 and V1. NOTE(review): relies on V0..V7 being contiguous in memory - confirm the struct layout. */ + Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); /* likewise fills V2 and V3 */ + + Vector256 t4 = Avx.UnpackLow(r2, r3); + Vector256 t6 = Avx.UnpackHigh(r2, r3); /* same interleave, now for the B halves */ + + Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); /* fills V4 and V5 */ + Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); /* fills V6 and V7 */ + } + else +#endif + { /* Scalar fallback (the only path when SUPPORTS_RUNTIME_INTRINSICS is undefined): writes the same values one field at a time. */ + this.V0.X = r.A.X; + this.V0.Y = g.A.X; + this.V0.Z = b.A.X; + this.V0.W = 1f; + + this.V1.X = r.A.Y; + this.V1.Y = g.A.Y; + this.V1.Z = b.A.Y; + this.V1.W = 1f; + + this.V2.X = r.A.Z; + this.V2.Y = g.A.Z; + this.V2.Z = b.A.Z; + this.V2.W = 1f; + + this.V3.X = r.A.W; + this.V3.Y = g.A.W; + this.V3.Z = b.A.W; + this.V3.W = 1f; + + this.V4.X = r.B.X; + this.V4.Y = g.B.X; + this.V4.Z = b.B.X; + this.V4.W = 1f; + + this.V5.X = r.B.Y; + this.V5.Y = g.B.Y; + this.V5.Z =
b.B.Y; + this.V5.W = 1f; + + this.V6.X = r.B.Z; + this.V6.Y = g.B.Z; + this.V6.Z = b.B.Z; + this.V6.W = 1f; + + this.V7.X = r.B.W; + this.V7.Y = g.B.W; + this.V7.Z = b.B.W; + this.V7.W = 1f; + } } } } diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs new file mode 100644 index 0000000000..a7ea771988 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs @@ -0,0 +1,40 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System.Numerics; +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Tuples; +using static SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters.JpegColorConverter; + +namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class Vector4OctetPack + { + private static Vector4Pair r = new Vector4Pair + { + A = new Vector4(1, 2, 3, 4), + B = new Vector4(5, 6, 7, 8) + }; + + private static Vector4Pair g = new Vector4Pair + { + A = new Vector4(9, 10, 11, 12), + B = new Vector4(13, 14, 15, 16) + }; + + private static Vector4Pair b = new Vector4Pair + { + A = new Vector4(17, 18, 19, 20), + B = new Vector4(21, 22, 23, 24) + }; + + [Benchmark] + public void Pack() + { + Vector4Octet v = default; + + v.Pack(ref r, ref g, ref b); /* NOTE(review): v is never read after this call; consider returning it from the benchmark so the measured work cannot be optimized away - confirm against BenchmarkDotNet guidance. */ + } + } +} diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index e860c5491f..e8a06bf24e 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -73,7 +73,9 @@ namespace SixLabors.ImageSharp.Benchmarks } #endif this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithEnvironmentVariables(new EnvironmentVariable(EnableHWIntrinsic, Off)) + .WithEnvironmentVariables( + new EnvironmentVariable(EnableHWIntrinsic, Off), + new EnvironmentVariable(FeatureSIMD, Off)) /* this job now sets FeatureSIMD to Off in addition to EnableHWIntrinsic; NOTE(review): the variable names behind these constants are defined outside this patch - confirm what each one disables. */
.WithId("No HwIntrinsics")); } }