From f493aa4efa5ed84f1cd2c659e19bb5fa9177d82b Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 21 Oct 2020 00:00:46 +0100 Subject: [PATCH 01/12] Implement SimdUtils.HwIntrinsics --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 276 ++++++++++++++++++ src/ImageSharp/Common/Helpers/SimdUtils.cs | 7 +- .../Color/Bulk/FromVector4.cs | 4 +- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 20 +- 4 files changed, 295 insertions(+), 12 deletions(-) create mode 100644 src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs new file mode 100644 index 0000000000..2fe2f99ac6 --- /dev/null +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -0,0 +1,276 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +#if SUPPORTS_RUNTIME_INTRINSICS +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + public static class HwIntrinsics + { + private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + + /// + /// as many elements as possible, slicing them down (keeping the remainder). 
+ /// + [MethodImpl(InliningOptions.ShortMethod)] + internal static void ByteToNormalizedFloatReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); + + if (Avx2.IsSupported || Sse2.IsSupported) + { + int remainder; + if (Avx2.IsSupported) + { + remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count); + } + else + { + remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); + } + + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount)); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); + } + } + } + + /// + /// Implementation , which is faster on new RyuJIT runtime. + /// + /// + /// Implementation is based on MagicScaler code: + /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182 + /// + internal static unsafe void ByteToNormalizedFloat( + ReadOnlySpan source, + Span dest) + { + if (Avx2.IsSupported) + { + VerifySpanInput(source, dest, Vector256.Count); + + int n = dest.Length / Vector256.Count; + ref Vector256 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = Vector256.Create(1 / (float)byte.MaxValue); + + fixed (byte* sourceBase = source) // Pinned: the raw pointer must stay valid if the GC moves the backing array. + { + for (int i = 0; i < n; i++) + { + int si = Vector256.Count * i; + Vector256 i0 = Avx2.ConvertToVector256Int32(sourceBase + si); + Vector256 i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256.Count); + Vector256 i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256.Count * 2)); + Vector256 i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256.Count * 3)); + + Vector256 f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0)); + Vector256 f1 = 
Avx.Multiply(scale, Avx.ConvertToVector256Single(i1)); + Vector256 f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2)); + Vector256 f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3)); + + ref Vector256 d = ref Unsafe.Add(ref destBase, i * 4); + + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; + } + } + } + else + { + // Sse + VerifySpanInput(source, dest, Vector128.Count); + + int n = dest.Length / Vector128.Count; + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = Vector128.Create(1 / (float)byte.MaxValue); + Vector128 zero = Vector128.Zero; + + fixed (byte* sourceBase = source) // Pinned for the same reason as the AVX2 path. + { + for (int i = 0; i < n; i++) + { + int si = Vector128.Count * i; + + Vector128 i0, i1, i2, i3; + if (Sse41.IsSupported) + { + i0 = Sse41.ConvertToVector128Int32(sourceBase + si); + i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128.Count); + i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128.Count * 2)); + i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128.Count * 3)); + } + else + { + Vector128 b = Sse2.LoadVector128(sourceBase + si); + Vector128 s0 = Sse2.UnpackLow(b, zero).AsInt16(); + Vector128 s1 = Sse2.UnpackHigh(b, zero).AsInt16(); + + i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32(); + i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32(); + i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32(); + i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32(); + } + + Vector128 f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0)); + Vector128 f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1)); + Vector128 f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2)); + Vector128 f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3)); + + ref Vector128 d = ref Unsafe.Add(ref destBase, i * 4); + + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = 
f3; + } + } + } + } + + /// + /// as many elements as possible, slicing them down (keeping the remainder). + /// + [MethodImpl(InliningOptions.ShortMethod)] + internal static void NormalizedFloatToByteSaturateReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); + + if (Avx2.IsSupported || Sse2.IsSupported) + { + int remainder; + if (Avx2.IsSupported) + { + remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count); + } + else + { + remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); + } + + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + NormalizedFloatToByteSaturate( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount)); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); + } + } + } + + /// + /// Implementation of , which is faster on new .NET runtime. + /// + /// + /// Implementation is based on MagicScaler code: + /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622 + /// + internal static void NormalizedFloatToByteSaturate( + ReadOnlySpan source, + Span dest) + { + if (Avx2.IsSupported) + { + VerifySpanInput(source, dest, Vector256.Count); + + int n = dest.Length / Vector256.Count; + + ref Vector256 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector256 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = Vector256.Create((float)byte.MaxValue); + ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32); + Vector256 mask = Unsafe.As>(ref maskBase); + + for (int i = 0; i < n; i++) + { + ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4); + + Vector256 f0 = Avx.Multiply(scale, s); + Vector256 f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1)); + Vector256 f2 = Avx.Multiply(scale, 
Unsafe.Add(ref s, 2)); + Vector256 f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3)); + + Vector256 w0 = Avx.ConvertToVector256Int32(f0); + Vector256 w1 = Avx.ConvertToVector256Int32(f1); + Vector256 w2 = Avx.ConvertToVector256Int32(f2); + Vector256 w3 = Avx.ConvertToVector256Int32(f3); + + Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); + Vector256 u1 = Avx2.PackSignedSaturate(w2, w3); + Vector256 b = Avx2.PackUnsignedSaturate(u0, u1); + b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte(); + + Unsafe.Add(ref destBase, i) = b; + } + } + else + { + // Sse + VerifySpanInput(source, dest, Vector128.Count); + + int n = dest.Length / Vector128.Count; + + ref Vector128 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = Vector128.Create((float)byte.MaxValue); + + for (int i = 0; i < n; i++) + { + ref Vector128 s = ref Unsafe.Add(ref sourceBase, i * 4); + + Vector128 f0 = Sse.Multiply(scale, s); + Vector128 f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1)); + Vector128 f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2)); + Vector128 f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3)); + + Vector128 w0 = Sse2.ConvertToVector128Int32(f0); + Vector128 w1 = Sse2.ConvertToVector128Int32(f1); + Vector128 w2 = Sse2.ConvertToVector128Int32(f2); + Vector128 w3 = Sse2.ConvertToVector128Int32(f3); + + Vector128 u0 = Sse2.PackSignedSaturate(w0, w1); + Vector128 u1 = Sse2.PackSignedSaturate(w2, w3); + + Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1); + } + } + } + } + } +} +#endif diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 7f917648dc..df533cedf1 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -79,8 +79,9 @@ namespace SixLabors.ImageSharp internal static void ByteToNormalizedFloat(ReadOnlySpan source, Span dest) { DebugGuard.IsTrue(source.Length 
== dest.Length, nameof(source), "Input spans must be of same length!"); - -#if SUPPORTS_EXTENDED_INTRINSICS +#if SUPPORTS_RUNTIME_INTRINSICS + HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest); +#elif SUPPORTS_EXTENDED_INTRINSICS ExtendedIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest); #else BasicIntrinsics256.ByteToNormalizedFloatReduce(ref source, ref dest); @@ -110,7 +111,7 @@ namespace SixLabors.ImageSharp DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); #if SUPPORTS_RUNTIME_INTRINSICS - Avx2Intrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest); + HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest); #elif SUPPORTS_EXTENDED_INTRINSICS ExtendedIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest); #else diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs index da15da24c7..267bca4ad0 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs @@ -104,12 +104,12 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk #if SUPPORTS_RUNTIME_INTRINSICS [Benchmark] - public void UseAvx2() + public void UseHwIntrinsics() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - SimdUtils.Avx2Intrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats); + SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats); } private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 6dce489353..eca4e72cba 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ 
b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -204,6 +204,17 @@ namespace SixLabors.ImageSharp.Tests.Common (s, d) => SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(s.Span, d.Span)); } +#if SUPPORTS_RUNTIME_INTRINSICS + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy32))] + public void HwIntrinsics_BulkConvertByteToNormalizedFloat(int count) + { + TestImpl_BulkConvertByteToNormalizedFloat( + count, + (s, d) => SimdUtils.HwIntrinsics.ByteToNormalizedFloat(s.Span, d.Span)); + } +#endif + [Theory] [MemberData(nameof(ArbitraryArraySizes))] public void BulkConvertByteToNormalizedFloat(int count) @@ -281,16 +292,11 @@ namespace SixLabors.ImageSharp.Tests.Common [Theory] [MemberData(nameof(ArraySizesDivisibleBy32))] - public void Avx2_BulkConvertNormalizedFloatToByteClampOverflows(int count) + public void HwIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) { - if (!System.Runtime.Intrinsics.X86.Avx2.IsSupported) - { - return; - } - TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( count, - (s, d) => SimdUtils.Avx2Intrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span)); + (s, d) => SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span)); } #endif From 8e993394b44eb7a436311aa88c541fd30144cbdd Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 21 Oct 2020 14:49:05 +0100 Subject: [PATCH 02/12] Benchmarks, tests, and cleanup. 
--- .../Helpers/SimdUtils.Avx2Intrinsics.cs | 103 ------------------ .../Color/Bulk/FromVector4.cs | 4 +- .../Color/Bulk/ToVector4_Rgba32.cs | 13 ++- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 30 +++-- .../FeatureTesting/FeatureTestRunner.cs | 50 +++++++++ 5 files changed, 86 insertions(+), 114 deletions(-) delete mode 100644 src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs deleted file mode 100644 index b56c92dab7..0000000000 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Apache License, Version 2.0. - -#if SUPPORTS_RUNTIME_INTRINSICS - -using System; -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; - -namespace SixLabors.ImageSharp -{ - internal static partial class SimdUtils - { - public static class Avx2Intrinsics - { - private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; - - /// - /// as many elements as possible, slicing them down (keeping the remainder). 
- /// - [MethodImpl(InliningOptions.ShortMethod)] - internal static void NormalizedFloatToByteSaturateReduce( - ref ReadOnlySpan source, - ref Span dest) - { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); - - if (Avx2.IsSupported) - { - int remainder = ImageMaths.ModuloP2(source.Length, Vector.Count); - int adjustedCount = source.Length - remainder; - - if (adjustedCount > 0) - { - NormalizedFloatToByteSaturate( - source.Slice(0, adjustedCount), - dest.Slice(0, adjustedCount)); - - source = source.Slice(adjustedCount); - dest = dest.Slice(adjustedCount); - } - } - } - - /// - /// Implementation of , which is faster on new .NET runtime. - /// - /// - /// Implementation is based on MagicScaler code: - /// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477 - /// - internal static void NormalizedFloatToByteSaturate( - ReadOnlySpan source, - Span dest) - { - VerifySpanInput(source, dest, Vector256.Count); - - int n = dest.Length / Vector256.Count; - - ref Vector256 sourceBase = - ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); - ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - - var maxBytes = Vector256.Create(255f); - ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32); - Vector256 mask = Unsafe.As>(ref maskBase); - - for (int i = 0; i < n; i++) - { - ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4); - - Vector256 f0 = s; - Vector256 f1 = Unsafe.Add(ref s, 1); - Vector256 f2 = Unsafe.Add(ref s, 2); - Vector256 f3 = Unsafe.Add(ref s, 3); - - Vector256 w0 = ConvertToInt32(f0, maxBytes); - Vector256 w1 = ConvertToInt32(f1, maxBytes); - Vector256 w2 = ConvertToInt32(f2, maxBytes); - Vector256 w3 = ConvertToInt32(f3, maxBytes); - - Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); - Vector256 u1 = Avx2.PackSignedSaturate(w2, w3); - Vector256 b = 
Avx2.PackUnsignedSaturate(u0, u1); - b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte(); - - Unsafe.Add(ref destBase, i) = b; - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 ConvertToInt32(Vector256 vf, Vector256 scale) - { - vf = Avx.Multiply(vf, scale); - return Avx.ConvertToVector256Int32(vf); - } - } - } -} -#endif diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs index 267bca4ad0..dc030e07a7 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs @@ -13,15 +13,13 @@ using System.Runtime.Intrinsics.X86; #endif using BenchmarkDotNet.Attributes; -using BenchmarkDotNet.Environments; -using BenchmarkDotNet.Jobs; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk { - [Config(typeof(Config.ShortClr))] + [Config(typeof(Config.ShortCore31))] public abstract class FromVector4 where TPixel : unmanaged, IPixel { diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgba32.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgba32.cs index 145bf9889b..9ae3b073d4 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgba32.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgba32.cs @@ -13,7 +13,7 @@ using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk { - [Config(typeof(Config.ShortClr))] + [Config(typeof(Config.ShortCore31))] public class ToVector4_Rgba32 : ToVector4 { [Benchmark] @@ -52,6 +52,17 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(sBytes, dFloats); } +#if SUPPORTS_RUNTIME_INTRINSICS + [Benchmark] + public void HwIntrinsics() + { + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = 
MemoryMarshal.Cast(this.destination.GetSpan()); + + SimdUtils.HwIntrinsics.ByteToNormalizedFloat(sBytes, dFloats); + } +#endif + // [Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops() { diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index eca4e72cba..838db742a1 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -7,7 +7,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using SixLabors.ImageSharp.Common.Tuples; - +using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; using Xunit.Abstractions; @@ -209,9 +209,17 @@ namespace SixLabors.ImageSharp.Tests.Common [MemberData(nameof(ArraySizesDivisibleBy32))] public void HwIntrinsics_BulkConvertByteToNormalizedFloat(int count) { - TestImpl_BulkConvertByteToNormalizedFloat( - count, - (s, d) => SimdUtils.HwIntrinsics.ByteToNormalizedFloat(s.Span, d.Span)); + static void RunTest(string serialized) + { + TestImpl_BulkConvertByteToNormalizedFloat( + FeatureTestRunner.Deserialize(serialized), + (s, d) => SimdUtils.HwIntrinsics.ByteToNormalizedFloat(s.Span, d.Span)); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41, + count); } #endif @@ -294,9 +302,17 @@ namespace SixLabors.ImageSharp.Tests.Common [MemberData(nameof(ArraySizesDivisibleBy32))] public void HwIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) { - TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( - count, - (s, d) => SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span)); + static void RunTest(string serialized) + { + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( + FeatureTestRunner.Deserialize(serialized), + (s, d) => SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span)); + } + + 
FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2, + count); } #endif diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs index eb1714baad..fdba9ce982 100644 --- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs +++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs @@ -33,6 +33,14 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities where T : IXunitSerializable => BasicSerializer.Deserialize(value); + /// + /// Allows the deserialization of integers passed to the feature test. + /// + /// The string value to deserialize. + /// The value. + public static int Deserialize(string value) + => Convert.ToInt32(value); + /// /// Runs the given test within an environment /// where the given features. @@ -201,6 +209,48 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities } } + /// + /// Runs the given test within an environment + /// where the given features. + /// + /// The test action to run. + /// The intrinsics features. + /// The value to pass as a parameter to the test action. + public static void RunWithHwIntrinsicsFeature( + Action action, + HwIntrinsics intrinsics, + int serializable) + { + if (!RemoteExecutor.IsSupported) + { + return; + } + + foreach (KeyValuePair intrinsic in intrinsics.ToFeatureKeyValueCollection()) + { + var processStartInfo = new ProcessStartInfo(); + if (intrinsic.Key != HwIntrinsics.AllowAll) + { + processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0"; + + RemoteExecutor.Invoke( + action, + serializable.ToString(), + new RemoteInvokeOptions + { + StartInfo = processStartInfo + }) + .Dispose(); + } + else + { + // Since we are running using the default architecture there is no + // point creating the overhead of running the action in a separate process. 
+ action(serializable.ToString()); + } + } + } + internal static Dictionary ToFeatureKeyValueCollection(this HwIntrinsics intrinsics) { // Loop through and translate the given values into COMPlus equivaluents From aecf80388cd4f8a33d709d2ff1f359de9cfa8319 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 21 Oct 2020 17:59:57 +0100 Subject: [PATCH 03/12] Add Avx2 Span Premultiplication and Reverse --- src/ImageSharp/Common/Helpers/ImageMaths.cs | 6 ++ .../Common/Helpers/Vector4Utilities.cs | 80 ++++++++++++++++--- .../Color/Bulk/PremultiplyVector4.cs | 68 ++++++++++++++++ .../Color/Bulk/UnPremultiplyVector4.cs | 68 ++++++++++++++++ .../Helpers/ImageMathsTests.cs | 15 ++++ .../Helpers/Vector4UtilsTests.cs | 2 + 6 files changed, 229 insertions(+), 10 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs index 977432f8bb..d24230fe18 100644 --- a/src/ImageSharp/Common/Helpers/ImageMaths.cs +++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs @@ -132,6 +132,12 @@ namespace SixLabors.ImageSharp return (a / GreatestCommonDivisor(a, b)) * b; } + /// + /// Calculates % 2 + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static int Modulo2(int x) => x & 1; + /// /// Calculates % 4 /// diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs index fccc50755d..848a917912 100644 --- a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs @@ -5,6 +5,10 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp { @@ -13,6 +17,10 @@ 
namespace SixLabors.ImageSharp /// internal static class Vector4Utilities { + private const int BlendAlphaControl = 0b10001000; + + private static ReadOnlySpan PermuteAlphaMask8x32 => new byte[] { 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0 }; + /// /// Restricts a vector between a minimum and a maximum value. /// 5x Faster then . @@ -56,13 +64,39 @@ namespace SixLabors.ImageSharp [MethodImpl(InliningOptions.ShortMethod)] public static void Premultiply(Span vectors) { - // TODO: This method can be AVX2 optimized using Vector - ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported && vectors.Length >= 2) + { + ref Vector256 vectorsBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); - for (int i = 0; i < vectors.Length; i++) + Vector256 mask = + Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); + + int n = (vectors.Length * 4) / Vector256.Count; + for (int i = 0; i < n; i++) + { + ref Vector256 source = ref Unsafe.Add(ref vectorsBase, i); + Vector256 multiply = Avx2.PermuteVar8x32(source, mask); + source = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl); + } + + if (ImageMaths.Modulo2(vectors.Length) != 0) + { + // Vector4 fits neatly in pairs. Any overlap has to be equal to 1. 
+ Premultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1))); + } + } + else +#endif { - ref Vector4 v = ref Unsafe.Add(ref baseRef, i); - Premultiply(ref v); + ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); + + for (int i = 0; i < vectors.Length; i++) + { + ref Vector4 v = ref Unsafe.Add(ref baseRef, i); + Premultiply(ref v); + } } } @@ -73,13 +107,39 @@ namespace SixLabors.ImageSharp [MethodImpl(InliningOptions.ShortMethod)] public static void UnPremultiply(Span vectors) { - // TODO: This method can be AVX2 optimized using Vector - ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported && vectors.Length >= 2) + { + ref Vector256 vectorsBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); - for (int i = 0; i < vectors.Length; i++) + Vector256 mask = + Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); + + int n = (vectors.Length * 4) / Vector256.Count; + for (int i = 0; i < n; i++) + { + ref Vector256 source = ref Unsafe.Add(ref vectorsBase, i); + Vector256 multiply = Avx2.PermuteVar8x32(source, mask); + source = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl); + } + + if (ImageMaths.Modulo2(vectors.Length) != 0) + { + // Vector4 fits neatly in pairs. Any overlap has to be equal to 1. 
+ UnPremultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1))); + } + } + else +#endif { - ref Vector4 v = ref Unsafe.Add(ref baseRef, i); - UnPremultiply(ref v); + ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); + + for (int i = 0; i < vectors.Length; i++) + { + ref Vector4 v = ref Unsafe.Add(ref baseRef, i); + UnPremultiply(ref v); + } } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs new file mode 100644 index 0000000000..2a886c6879 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs @@ -0,0 +1,68 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.ShortCore31))] + public class PremultiplyVector4 + { + private static readonly Vector4[] Vectors = CreateVectors(); + + [Benchmark(Baseline = true)] + public void PremultiplyBaseline() + { + ref Vector4 baseRef = ref MemoryMarshal.GetReference(Vectors); + + for (int i = 0; i < Vectors.Length; i++) + { + ref Vector4 v = ref Unsafe.Add(ref baseRef, i); + Premultiply(ref v); + } + } + + [Benchmark] + public void Premultiply() + { + Vector4Utilities.Premultiply(Vectors); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void Premultiply(ref Vector4 source) + { + float w = source.W; + source *= w; + source.W = w; + } + + private static Vector4[] CreateVectors() + { + var rnd = new Random(42); + return GenerateRandomVectorArray(rnd, 2048, 0, 1); + } + + private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal) + { + var values = new Vector4[length]; + + for (int i = 0; i < length; i++) + { + ref Vector4 v = ref values[i]; + v.X = 
GetRandomFloat(rnd, minVal, maxVal); + v.Y = GetRandomFloat(rnd, minVal, maxVal); + v.Z = GetRandomFloat(rnd, minVal, maxVal); + v.W = GetRandomFloat(rnd, minVal, maxVal); + } + + return values; + } + + private static float GetRandomFloat(Random rnd, float minVal, float maxVal) + => ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal; + } +} diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs new file mode 100644 index 0000000000..89e055da46 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs @@ -0,0 +1,68 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.ShortCore31))] + public class UnPremultiplyVector4 + { + private static readonly Vector4[] Vectors = CreateVectors(); + + [Benchmark(Baseline = true)] + public void UnPremultiplyBaseline() + { + ref Vector4 baseRef = ref MemoryMarshal.GetReference(Vectors); + + for (int i = 0; i < Vectors.Length; i++) + { + ref Vector4 v = ref Unsafe.Add(ref baseRef, i); + UnPremultiply(ref v); + } + } + + [Benchmark] + public void UnPremultiply() + { + Vector4Utilities.UnPremultiply(Vectors); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void UnPremultiply(ref Vector4 source) + { + float w = source.W; + source /= w; // Reverse premultiplication: divide by alpha, then restore W. + source.W = w; + } + + private static Vector4[] CreateVectors() + { + var rnd = new Random(42); + return GenerateRandomVectorArray(rnd, 2048, 0, 1); + } + + private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal) + { + var values = new Vector4[length]; + + for (int i = 0; i < length; i++) + { + ref Vector4 v = ref values[i]; + v.X = 
minVal, maxVal); + v.Y = GetRandomFloat(rnd, minVal, maxVal); + v.Z = GetRandomFloat(rnd, minVal, maxVal); + v.W = GetRandomFloat(rnd, minVal, maxVal); + } + + return values; + } + + private static float GetRandomFloat(Random rnd, float minVal, float maxVal) + => ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal; + } +} diff --git a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs index 27689f6813..7d16623877 100644 --- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs +++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs @@ -10,6 +10,21 @@ namespace SixLabors.ImageSharp.Tests.Helpers { public class ImageMathsTests { + [Theory] + [InlineData(0)] + [InlineData(1)] + [InlineData(2)] + [InlineData(3)] + [InlineData(4)] + [InlineData(100)] + [InlineData(123)] + [InlineData(53436353)] + public void Modulo2(int x) + { + int actual = ImageMaths.Modulo2(x); + Assert.Equal(x % 2, actual); + } + [Theory] [InlineData(0)] [InlineData(1)] diff --git a/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs b/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs index c3b8e79ee2..2bb43c440b 100644 --- a/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs +++ b/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs @@ -17,6 +17,7 @@ namespace SixLabors.ImageSharp.Tests.Helpers [InlineData(0)] [InlineData(1)] [InlineData(30)] + [InlineData(63)] public void Premultiply_VectorSpan(int length) { var rnd = new Random(42); @@ -36,6 +37,7 @@ namespace SixLabors.ImageSharp.Tests.Helpers [InlineData(0)] [InlineData(1)] [InlineData(30)] + [InlineData(63)] public void UnPremultiply_VectorSpan(int length) { var rnd = new Random(42); From 1067acbe4c57ba7fc601186ad58b1087380a8a69 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 21 Oct 2020 22:22:47 +0100 Subject: [PATCH 04/12] Use Tanner's updated code. 
--- .../Common/Helpers/Vector4Utilities.cs | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs index 848a917912..5ae7ac1b71 100644 --- a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs @@ -61,7 +61,7 @@ namespace SixLabors.ImageSharp /// Bulk variant of /// /// The span of vectors - [MethodImpl(InliningOptions.ShortMethod)] + [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] public static void Premultiply(Span vectors) { #if SUPPORTS_RUNTIME_INTRINSICS @@ -73,12 +73,15 @@ namespace SixLabors.ImageSharp Vector256 mask = Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); - int n = (vectors.Length * 4) / Vector256.Count; - for (int i = 0; i < n; i++) + // Divide by 2 as 4 elements per Vector4 and 8 per Vector256 + ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u)); + + while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast)) { - ref Vector256 source = ref Unsafe.Add(ref vectorsBase, i); + Vector256 source = vectorsBase; Vector256 multiply = Avx2.PermuteVar8x32(source, mask); - source = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl); + vectorsBase = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl); + vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); } if (ImageMaths.Modulo2(vectors.Length) != 0) @@ -104,7 +107,7 @@ namespace SixLabors.ImageSharp /// Bulk variant of /// /// The span of vectors - [MethodImpl(InliningOptions.ShortMethod)] + [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] public static void UnPremultiply(Span vectors) { #if SUPPORTS_RUNTIME_INTRINSICS @@ -116,12 +119,15 @@ namespace SixLabors.ImageSharp Vector256 mask = Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); - int n = (vectors.Length * 4) 
/ Vector256.Count; - for (int i = 0; i < n; i++) + // Divide by 2 as 4 elements per Vector4 and 8 per Vector256 + ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u)); + + while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast)) { - ref Vector256 source = ref Unsafe.Add(ref vectorsBase, i); + Vector256 source = vectorsBase; Vector256 multiply = Avx2.PermuteVar8x32(source, mask); - source = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl); + vectorsBase = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl); + vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); } if (ImageMaths.Modulo2(vectors.Length) != 0) From d4e0bdd7b7949072c6bc47e07301fce8ab5a96af Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 21 Oct 2020 23:59:37 +0100 Subject: [PATCH 05/12] Remove hotpath attr --- src/ImageSharp/Common/Helpers/Vector4Utilities.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs index 5ae7ac1b71..0137d02568 100644 --- a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs @@ -61,7 +61,7 @@ namespace SixLabors.ImageSharp /// Bulk variant of /// /// The span of vectors - [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] + [MethodImpl(InliningOptions.ShortMethod)] public static void Premultiply(Span vectors) { #if SUPPORTS_RUNTIME_INTRINSICS @@ -107,7 +107,7 @@ namespace SixLabors.ImageSharp /// Bulk variant of /// /// The span of vectors - [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] + [MethodImpl(InliningOptions.ShortMethod)] public static void UnPremultiply(Span vectors) { #if SUPPORTS_RUNTIME_INTRINSICS From e3faadbf2edac8a51d09bf593088f42a073bd60b Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 22 Oct 2020 10:34:42 +0100 Subject: [PATCH 06/12] Use 
Avx.Shuffle for lower latency --- src/ImageSharp/Common/Helpers/Vector4Utilities.cs | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs index 0137d02568..f617e9a3ea 100644 --- a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs @@ -17,9 +17,8 @@ namespace SixLabors.ImageSharp /// internal static class Vector4Utilities { - private const int BlendAlphaControl = 0b10001000; - - private static ReadOnlySpan PermuteAlphaMask8x32 => new byte[] { 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0 }; + private const int BlendAlphaControl = 0b_10_00_10_00; + private const int ShuffleAlphaControl = 0b_11_11_11_11; /// /// Restricts a vector between a minimum and a maximum value. @@ -70,16 +69,13 @@ namespace SixLabors.ImageSharp ref Vector256 vectorsBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); - Vector256 mask = - Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); - // Divide by 2 as 4 elements per Vector4 and 8 per Vector256 ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u)); while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast)) { Vector256 source = vectorsBase; - Vector256 multiply = Avx2.PermuteVar8x32(source, mask); + Vector256 multiply = Avx.Shuffle(source, source, ShuffleAlphaControl); vectorsBase = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl); vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); } @@ -116,16 +112,13 @@ namespace SixLabors.ImageSharp ref Vector256 vectorsBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); - Vector256 mask = - Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); - // Divide by 2 as 4 elements per Vector4 and 8 per Vector256 ref Vector256 vectorsLast = ref Unsafe.Add(ref 
vectorsBase, (IntPtr)((uint)vectors.Length / 2u)); while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast)) { Vector256 source = vectorsBase; - Vector256 multiply = Avx2.PermuteVar8x32(source, mask); + Vector256 multiply = Avx.Shuffle(source, source, ShuffleAlphaControl); vectorsBase = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl); vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); } From 05b66da9f79a8faba536d3614469d7b477e93eaa Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 22 Oct 2020 10:53:14 +0100 Subject: [PATCH 07/12] Fix base unpremultiply benchmark --- tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs index 89e055da46..1312c767be 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs @@ -36,7 +36,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk private static void UnPremultiply(ref Vector4 source) { float w = source.W; - source *= w; + source /= w; source.W = w; } From 9629f1c16e87e1a960e9e635f1595ba2af7dae02 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 23 Oct 2020 14:03:33 +0100 Subject: [PATCH 08/12] Add AVX2 implementation --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 2 +- .../ColorConverters/JpegColorConverter.cs | 130 ++++++++++++------ .../Codecs/Jpeg/Vector4OctetPack.cs | 40 ++++++ .../Config.HwIntrinsics.cs | 4 +- 4 files changed, 134 insertions(+), 42 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 2fe2f99ac6..a51c21b37f 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ 
b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -14,7 +14,7 @@ namespace SixLabors.ImageSharp { public static class HwIntrinsics { - private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; /// /// as many elements as possible, slicing them down (keeping the remainder). diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index f68bca0412..f2a1c1e91e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -4,7 +4,12 @@ using System; using System.Collections.Generic; using System.Numerics; - +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.Tuples; @@ -190,45 +195,90 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters /// public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) { - this.V0.X = r.A.X; - this.V0.Y = g.A.X; - this.V0.Z = b.A.X; - this.V0.W = 1f; - - this.V1.X = r.A.Y; - this.V1.Y = g.A.Y; - this.V1.Z = b.A.Y; - this.V1.W = 1f; - - this.V2.X = r.A.Z; - this.V2.Y = g.A.Z; - this.V2.Z = b.A.Z; - this.V2.W = 1f; - - this.V3.X = r.A.W; - this.V3.Y = g.A.W; - this.V3.Z = b.A.W; - this.V3.W = 1f; - - this.V4.X = r.B.X; - this.V4.Y = g.B.X; - this.V4.Z = b.B.X; - this.V4.W = 1f; - - this.V5.X = r.B.Y; - this.V5.Y = g.B.Y; - this.V5.Z = b.B.Y; - this.V5.W = 1f; 
- - this.V6.X = r.B.Z; - this.V6.Y = g.B.Z; - this.V6.Z = b.B.Z; - this.V6.W = 1f; - - this.V7.X = r.B.W; - this.V7.Y = g.B.W; - this.V7.Z = b.B.W; - this.V7.W = 1f; +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + Vector4 vo = Vector4.One; + Vector128 valpha = Unsafe.As>(ref vo); + + ref byte control = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskDeinterleave8x32); + Vector256 vcontrol = Unsafe.As>(ref control); + + Vector256 r0 = Avx.InsertVector128( + Unsafe.As>(ref r.A).ToVector256(), + Unsafe.As>(ref g.A), + 1); + + Vector256 r1 = Avx.InsertVector128( + Unsafe.As>(ref b.A).ToVector256(), + valpha, + 1); + + Vector256 r2 = Avx.InsertVector128( + Unsafe.As>(ref r.B).ToVector256(), + Unsafe.As>(ref g.B), + 1); + + Vector256 r3 = Avx.InsertVector128( + Unsafe.As>(ref b.B).ToVector256(), + valpha, + 1); + + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackHigh(r0, r1); + + Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); + Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); + + Vector256 t4 = Avx.UnpackLow(r2, r3); + Vector256 t6 = Avx.UnpackHigh(r2, r3); + + Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); + Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); + } + else +#endif + { + this.V0.X = r.A.X; + this.V0.Y = g.A.X; + this.V0.Z = b.A.X; + this.V0.W = 1f; + + this.V1.X = r.A.Y; + this.V1.Y = g.A.Y; + this.V1.Z = b.A.Y; + this.V1.W = 1f; + + this.V2.X = r.A.Z; + this.V2.Y = g.A.Z; + this.V2.Z = b.A.Z; + this.V2.W = 1f; + + this.V3.X = r.A.W; + this.V3.Y = g.A.W; + this.V3.Z = b.A.W; + this.V3.W = 1f; + + this.V4.X = r.B.X; + this.V4.Y = g.B.X; + this.V4.Z = b.B.X; + this.V4.W = 1f; + + this.V5.X = r.B.Y; + this.V5.Y = g.B.Y; + this.V5.Z = b.B.Y; + this.V5.W = 1f; + + this.V6.X = r.B.Z; + this.V6.Y = g.B.Z; + this.V6.Z = b.B.Z; + this.V6.W = 1f; + + this.V7.X = r.B.W; + this.V7.Y = g.B.W; + this.V7.Z = b.B.W; + this.V7.W = 1f; + } } } } diff --git 
a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs new file mode 100644 index 0000000000..a7ea771988 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs @@ -0,0 +1,40 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System.Numerics; +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Tuples; +using static SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters.JpegColorConverter; + +namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class Vector4OctetPack + { + private static Vector4Pair r = new Vector4Pair + { + A = new Vector4(1, 2, 3, 4), + B = new Vector4(5, 6, 7, 8) + }; + + private static Vector4Pair g = new Vector4Pair + { + A = new Vector4(9, 10, 11, 12), + B = new Vector4(13, 14, 15, 16) + }; + + private static Vector4Pair b = new Vector4Pair + { + A = new Vector4(17, 18, 19, 20), + B = new Vector4(21, 22, 23, 24) + }; + + [Benchmark] + public void Pack() + { + Vector4Octet v = default; + + v.Pack(ref r, ref g, ref b); + } + } +} diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index e860c5491f..e8a06bf24e 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -73,7 +73,9 @@ namespace SixLabors.ImageSharp.Benchmarks } #endif this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithEnvironmentVariables(new EnvironmentVariable(EnableHWIntrinsic, Off)) + .WithEnvironmentVariables( + new EnvironmentVariable(EnableHWIntrinsic, Off), + new EnvironmentVariable(FeatureSIMD, Off)) .WithId("No HwIntrinsics")); } } From b8081fd3e94e2b338a9e12e9a0d859f0d9f785d6 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 23 Oct 2020 16:54:10 +0100 Subject: [PATCH 09/12] Use HW color conversion --- 
.../Common/Helpers/SimdUtils.HwIntrinsics.cs | 23 +++ .../JpegColorConverter.FromYCbCrSimdAvx2.cs | 74 ++++++++- .../ColorConverters/JpegColorConverter.cs | 156 +++++++++--------- .../Codecs/Jpeg/Vector4OctetPack.cs | 40 ----- 4 files changed, 174 insertions(+), 119 deletions(-) delete mode 100644 tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index a51c21b37f..c5a7f5e909 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -16,6 +16,29 @@ namespace SixLabors.ImageSharp { public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + /// + /// Performs a multiplication and an addition of the . + /// + /// The vector to add to the intermediate result. + /// The first vector to multiply. + /// The second vector to multiply. + /// The . + [MethodImpl(InliningOptions.ShortMethod)] + public static Vector256 MultiplyAdd( + in Vector256 va, + in Vector256 vm0, + in Vector256 vm1) + { + if (Fma.IsSupported) + { + return Fma.MultiplyAdd(vm1, vm0, va); + } + else + { + return Avx.Add(Avx.Multiply(vm0, vm1), va); + } + } + /// /// as many elements as possible, slicing them down (keeping the remainder). /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index c4d1408a2e..8c34baa1dc 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -1,11 +1,15 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. 
// Licensed under the Apache License, Version 2.0. using System; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; - +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using static SixLabors.ImageSharp.SimdUtils; +#endif using SixLabors.ImageSharp.Tuples; // ReSharper disable ImpureMethodCallOnReadonlyValueField @@ -47,6 +51,71 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters "JpegColorConverter.FromYCbCrSimd256 can be used only on architecture having 256 byte floating point SIMD registers!"); } +#if SUPPORTS_RUNTIME_INTRINSICS + ref Vector256 yBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector256 cbBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector256 crBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + + ref Vector4Octet resultBase = + ref Unsafe.As(ref MemoryMarshal.GetReference(result)); + + // Used for the color conversion + var chromaOffset = Vector256.Create(-halfValue); + var scale = Vector256.Create(1 / maxValue); + var rCrMult = Vector256.Create(1.402F); + var gCbMult = Vector256.Create(0.344136F); + var gCrMult = Vector256.Create(0.714136F); + var bCbMult = Vector256.Create(1.772F); + + // Used for packing. 
+ Vector4 vo = Vector4.One; + Vector128 valpha = Unsafe.As>(ref vo); + ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskDeinterleave8x32); + Vector256 vcontrol = Unsafe.As>(ref control); + + Vector4Pair rr = default; + Vector4Pair gg = default; + Vector4Pair bb = default; + + ref Vector256 rrRefAsVector = ref Unsafe.As>(ref rr); + ref Vector256 ggRefAsVector = ref Unsafe.As>(ref gg); + ref Vector256 bbRefAsVector = ref Unsafe.As>(ref bb); + + // Walking 8 elements at one step: + int n = result.Length / 8; + for (int i = 0; i < n; i++) + { + // y = yVals[i]; + // cb = cbVals[i] - 128F; + // cr = crVals[i] - 128F; + Vector256 y = Unsafe.Add(ref yBase, i); + Vector256 cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset); + Vector256 cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset); + + // r = y + (1.402F * cr); + // g = y - (0.344136F * cb) - (0.714136F * cr); + // b = y + (1.772F * cb); + // Adding & multiplying 8 elements at one time: + Vector256 r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult); + Vector256 g = Avx.Subtract(Avx.Subtract(y, Avx.Multiply(cb, gCbMult)), Avx.Multiply(cr, gCrMult)); + Vector256 b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult); + + r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale); + g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale); + b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale); + + rrRefAsVector = r; + ggRefAsVector = g; + bbRefAsVector = b; + + // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... 
order: + ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); + destination.PackAvx2(ref rr, ref gg, ref bb, in valpha, in vcontrol); + } +#else ref Vector yBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); ref Vector cbBase = @@ -104,6 +173,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); destination.Pack(ref rr, ref gg, ref bb); } +#endif } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index f2a1c1e91e..4e96f3471d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -190,95 +190,97 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters #pragma warning disable SA1132 // Do not combine fields public Vector4 V0, V1, V2, V3, V4, V5, V6, V7; +#if SUPPORTS_RUNTIME_INTRINSICS + /// /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... 
/// - public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) + [MethodImpl(InliningOptions.ShortMethod)] + public void PackAvx2( + ref Vector4Pair r, + ref Vector4Pair g, + ref Vector4Pair b, + in Vector128 a, + in Vector256 vcontrol) { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) - { - Vector4 vo = Vector4.One; - Vector128 valpha = Unsafe.As>(ref vo); + Vector256 r0 = Avx.InsertVector128( + Unsafe.As>(ref r.A), + Unsafe.As>(ref g.A), + 1); - ref byte control = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskDeinterleave8x32); - Vector256 vcontrol = Unsafe.As>(ref control); + Vector256 r1 = Avx.InsertVector128( + Unsafe.As>(ref b.A), + a, + 1); - Vector256 r0 = Avx.InsertVector128( - Unsafe.As>(ref r.A).ToVector256(), - Unsafe.As>(ref g.A), - 1); + Vector256 r2 = Avx.InsertVector128( + Unsafe.As>(ref r.B).ToVector256(), + Unsafe.As>(ref g.B), + 1); - Vector256 r1 = Avx.InsertVector128( - Unsafe.As>(ref b.A).ToVector256(), - valpha, - 1); + Vector256 r3 = Avx.InsertVector128( + Unsafe.As>(ref b.B).ToVector256(), + a, + 1); - Vector256 r2 = Avx.InsertVector128( - Unsafe.As>(ref r.B).ToVector256(), - Unsafe.As>(ref g.B), - 1); + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackHigh(r0, r1); - Vector256 r3 = Avx.InsertVector128( - Unsafe.As>(ref b.B).ToVector256(), - valpha, - 1); + Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); + Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackHigh(r0, r1); + Vector256 t4 = Avx.UnpackLow(r2, r3); + Vector256 t6 = Avx.UnpackHigh(r2, r3); - Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); - Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); - - Vector256 t4 = Avx.UnpackLow(r2, r3); - Vector256 t6 = Avx.UnpackHigh(r2, r3); - - Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); - Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); - } - else + 
Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); + Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); + } #endif - { - this.V0.X = r.A.X; - this.V0.Y = g.A.X; - this.V0.Z = b.A.X; - this.V0.W = 1f; - - this.V1.X = r.A.Y; - this.V1.Y = g.A.Y; - this.V1.Z = b.A.Y; - this.V1.W = 1f; - - this.V2.X = r.A.Z; - this.V2.Y = g.A.Z; - this.V2.Z = b.A.Z; - this.V2.W = 1f; - - this.V3.X = r.A.W; - this.V3.Y = g.A.W; - this.V3.Z = b.A.W; - this.V3.W = 1f; - - this.V4.X = r.B.X; - this.V4.Y = g.B.X; - this.V4.Z = b.B.X; - this.V4.W = 1f; - - this.V5.X = r.B.Y; - this.V5.Y = g.B.Y; - this.V5.Z = b.B.Y; - this.V5.W = 1f; - - this.V6.X = r.B.Z; - this.V6.Y = g.B.Z; - this.V6.Z = b.B.Z; - this.V6.W = 1f; - - this.V7.X = r.B.W; - this.V7.Y = g.B.W; - this.V7.Z = b.B.W; - this.V7.W = 1f; - } + + /// + /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... + /// + public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) + { + this.V0.X = r.A.X; + this.V0.Y = g.A.X; + this.V0.Z = b.A.X; + this.V0.W = 1f; + + this.V1.X = r.A.Y; + this.V1.Y = g.A.Y; + this.V1.Z = b.A.Y; + this.V1.W = 1f; + + this.V2.X = r.A.Z; + this.V2.Y = g.A.Z; + this.V2.Z = b.A.Z; + this.V2.W = 1f; + + this.V3.X = r.A.W; + this.V3.Y = g.A.W; + this.V3.Z = b.A.W; + this.V3.W = 1f; + + this.V4.X = r.B.X; + this.V4.Y = g.B.X; + this.V4.Z = b.B.X; + this.V4.W = 1f; + + this.V5.X = r.B.Y; + this.V5.Y = g.B.Y; + this.V5.Z = b.B.Y; + this.V5.W = 1f; + + this.V6.X = r.B.Z; + this.V6.Y = g.B.Z; + this.V6.Z = b.B.Z; + this.V6.W = 1f; + + this.V7.X = r.B.W; + this.V7.Y = g.B.W; + this.V7.Z = b.B.W; + this.V7.W = 1f; } } } diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs deleted file mode 100644 index a7ea771988..0000000000 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) Six Labors. 
-// Licensed under the Apache License, Version 2.0. - -using System.Numerics; -using BenchmarkDotNet.Attributes; -using SixLabors.ImageSharp.Tuples; -using static SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters.JpegColorConverter; - -namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg -{ - [Config(typeof(Config.HwIntrinsics_SSE_AVX))] - public class Vector4OctetPack - { - private static Vector4Pair r = new Vector4Pair - { - A = new Vector4(1, 2, 3, 4), - B = new Vector4(5, 6, 7, 8) - }; - - private static Vector4Pair g = new Vector4Pair - { - A = new Vector4(9, 10, 11, 12), - B = new Vector4(13, 14, 15, 16) - }; - - private static Vector4Pair b = new Vector4Pair - { - A = new Vector4(17, 18, 19, 20), - B = new Vector4(21, 22, 23, 24) - }; - - [Benchmark] - public void Pack() - { - Vector4Octet v = default; - - v.Pack(ref r, ref g, ref b); - } - } -} From 50bc02764398f78ae862ec2b30363cdf3d71f52e Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 23 Oct 2020 18:09:06 +0100 Subject: [PATCH 10/12] Fix access violation --- .../Components/Decoder/ColorConverters/JpegColorConverter.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index 4e96f3471d..b40d9b9e6e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -204,12 +204,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters in Vector256 vcontrol) { Vector256 r0 = Avx.InsertVector128( - Unsafe.As>(ref r.A), + Unsafe.As>(ref r.A).ToVector256(), Unsafe.As>(ref g.A), 1); Vector256 r1 = Avx.InsertVector128( - Unsafe.As>(ref b.A), + Unsafe.As>(ref b.A).ToVector256(), a, 1); From 40442c24424ca43583700b5577c557f2ba21bd75 Mon Sep 17 
00:00:00 2001 From: James Jackson-South Date: Fri, 23 Oct 2020 18:46:00 +0100 Subject: [PATCH 11/12] Inline the packing. --- .../JpegColorConverter.FromYCbCrSimdAvx2.cs | 59 +++++++++++++------ .../ColorConverters/JpegColorConverter.cs | 53 ----------------- 2 files changed, 42 insertions(+), 70 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index 8c34baa1dc..ca7971a074 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -59,8 +59,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters ref Vector256 crBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); - ref Vector4Octet resultBase = - ref Unsafe.As(ref MemoryMarshal.GetReference(result)); + ref Vector256 resultBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(result)); // Used for the color conversion var chromaOffset = Vector256.Create(-halfValue); @@ -76,14 +76,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskDeinterleave8x32); Vector256 vcontrol = Unsafe.As>(ref control); - Vector4Pair rr = default; - Vector4Pair gg = default; - Vector4Pair bb = default; - - ref Vector256 rrRefAsVector = ref Unsafe.As>(ref rr); - ref Vector256 ggRefAsVector = ref Unsafe.As>(ref gg); - ref Vector256 bbRefAsVector = ref Unsafe.As>(ref bb); - // Walking 8 elements at one step: int n = result.Length / 8; for (int i = 0; i < n; i++) @@ -107,13 +99,46 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale); b = 
Avx.Multiply(Avx.RoundToNearestInteger(b), scale); - rrRefAsVector = r; - ggRefAsVector = g; - bbRefAsVector = b; - - // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order: - ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); - destination.PackAvx2(ref rr, ref gg, ref bb, in valpha, in vcontrol); + // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the + // expected (r0,g0,g1,1), (r1,g1,g2,1) ... order: + // + // Left side. + Vector256 r0 = Avx.InsertVector128( + r, + Unsafe.As, Vector128>(ref g), + 1); + + Vector256 r1 = Avx.InsertVector128( + b, + valpha, + 1); + + // Right side + Vector256 r2 = Avx.InsertVector128( + Unsafe.Add(ref Unsafe.As, Vector128>(ref r), 1).ToVector256(), + Unsafe.Add(ref Unsafe.As, Vector128>(ref g), 1), + 1); + + Vector256 r3 = Avx.InsertVector128( + Unsafe.Add(ref Unsafe.As, Vector128>(ref b), 1).ToVector256(), + valpha, + 1); + + // Split into separate rows + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackHigh(r0, r1); + + // Deinterleave and set + ref Vector256 destination = ref Unsafe.Add(ref resultBase, i * 4); + destination = Avx2.PermuteVar8x32(t0, vcontrol); + Unsafe.Add(ref destination, 1) = Avx2.PermuteVar8x32(t2, vcontrol); + + // Repeat for right side. 
+ Vector256 t4 = Avx.UnpackLow(r2, r3); + Vector256 t6 = Avx.UnpackHigh(r2, r3); + + Unsafe.Add(ref destination, 2) = Avx2.PermuteVar8x32(t4, vcontrol); + Unsafe.Add(ref destination, 3) = Avx2.PermuteVar8x32(t6, vcontrol); } #else ref Vector yBase = diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index b40d9b9e6e..7c780700c9 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -4,12 +4,6 @@ using System; using System.Collections.Generic; using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -#if SUPPORTS_RUNTIME_INTRINSICS -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; -#endif using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.Tuples; @@ -190,53 +184,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters #pragma warning disable SA1132 // Do not combine fields public Vector4 V0, V1, V2, V3, V4, V5, V6, V7; -#if SUPPORTS_RUNTIME_INTRINSICS - - /// - /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... 
- /// - [MethodImpl(InliningOptions.ShortMethod)] - public void PackAvx2( - ref Vector4Pair r, - ref Vector4Pair g, - ref Vector4Pair b, - in Vector128 a, - in Vector256 vcontrol) - { - Vector256 r0 = Avx.InsertVector128( - Unsafe.As>(ref r.A).ToVector256(), - Unsafe.As>(ref g.A), - 1); - - Vector256 r1 = Avx.InsertVector128( - Unsafe.As>(ref b.A).ToVector256(), - a, - 1); - - Vector256 r2 = Avx.InsertVector128( - Unsafe.As>(ref r.B).ToVector256(), - Unsafe.As>(ref g.B), - 1); - - Vector256 r3 = Avx.InsertVector128( - Unsafe.As>(ref b.B).ToVector256(), - a, - 1); - - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackHigh(r0, r1); - - Unsafe.As>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); - Unsafe.As>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); - - Vector256 t4 = Avx.UnpackLow(r2, r3); - Vector256 t6 = Avx.UnpackHigh(r2, r3); - - Unsafe.As>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); - Unsafe.As>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); - } -#endif - /// /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... 
/// From 238564b6096f540e247cfa725e334b21a79da5ab Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 24 Oct 2020 00:55:22 +0100 Subject: [PATCH 12/12] Use less permutes and more multiply/add --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 2 + .../JpegColorConverter.FromYCbCrSimdAvx2.cs | 63 ++++++------------- 2 files changed, 22 insertions(+), 43 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index c5a7f5e909..2d788992ee 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -16,6 +16,8 @@ namespace SixLabors.ImageSharp { public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + public static ReadOnlySpan PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; + /// /// Performs a multiplication and an addition of the . 
/// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index ca7971a074..1319b56ee0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -66,14 +66,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters var chromaOffset = Vector256.Create(-halfValue); var scale = Vector256.Create(1 / maxValue); var rCrMult = Vector256.Create(1.402F); - var gCbMult = Vector256.Create(0.344136F); - var gCrMult = Vector256.Create(0.714136F); + var gCbMult = Vector256.Create(-0.344136F); + var gCrMult = Vector256.Create(-0.714136F); var bCbMult = Vector256.Create(1.772F); // Used for packing. - Vector4 vo = Vector4.One; - Vector128 valpha = Unsafe.As>(ref vo); - ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskDeinterleave8x32); + var va = Vector256.Create(1F); + ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32); Vector256 vcontrol = Unsafe.As>(ref control); // Walking 8 elements at one step: @@ -87,58 +86,36 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters Vector256 cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset); Vector256 cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset); + y = Avx2.PermuteVar8x32(y, vcontrol); + cb = Avx2.PermuteVar8x32(cb, vcontrol); + cr = Avx2.PermuteVar8x32(cr, vcontrol); + // r = y + (1.402F * cr); // g = y - (0.344136F * cb) - (0.714136F * cr); // b = y + (1.772F * cb); // Adding & multiplying 8 elements at one time: Vector256 r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult); - Vector256 g = Avx.Subtract(Avx.Subtract(y, Avx.Multiply(cb, gCbMult)), Avx.Multiply(cr, gCrMult)); + Vector256 g = 
HwIntrinsics.MultiplyAdd(HwIntrinsics.MultiplyAdd(y, cb, gCbMult), cr, gCrMult); Vector256 b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult); + // TODO: We should be saving to RGBA not Vector4 r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale); g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale); b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale); - // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the - // expected (r0,g0,g1,1), (r1,g1,g2,1) ... order: - // - // Left side. - Vector256 r0 = Avx.InsertVector128( - r, - Unsafe.As, Vector128>(ref g), - 1); - - Vector256 r1 = Avx.InsertVector128( - b, - valpha, - 1); - - // Right side - Vector256 r2 = Avx.InsertVector128( - Unsafe.Add(ref Unsafe.As, Vector128>(ref r), 1).ToVector256(), - Unsafe.Add(ref Unsafe.As, Vector128>(ref g), 1), - 1); - - Vector256 r3 = Avx.InsertVector128( - Unsafe.Add(ref Unsafe.As, Vector128>(ref b), 1).ToVector256(), - valpha, - 1); - - // Split into separate rows - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackHigh(r0, r1); - - // Deinterleave and set + Vector256 vte = Avx.UnpackLow(r, b); + Vector256 vto = Avx.UnpackLow(g, va); + ref Vector256 destination = ref Unsafe.Add(ref resultBase, i * 4); - destination = Avx2.PermuteVar8x32(t0, vcontrol); - Unsafe.Add(ref destination, 1) = Avx2.PermuteVar8x32(t2, vcontrol); - // Repeat for right side. - Vector256 t4 = Avx.UnpackLow(r2, r3); - Vector256 t6 = Avx.UnpackHigh(r2, r3); + destination = Avx.UnpackLow(vte, vto); + Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto); + + vte = Avx.UnpackHigh(r, b); + vto = Avx.UnpackHigh(g, va); - Unsafe.Add(ref destination, 2) = Avx2.PermuteVar8x32(t4, vcontrol); - Unsafe.Add(ref destination, 3) = Avx2.PermuteVar8x32(t6, vcontrol); + Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto); + Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto); } #else ref Vector yBase =