From aecf80388cd4f8a33d709d2ff1f359de9cfa8319 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 21 Oct 2020 17:59:57 +0100 Subject: [PATCH 1/5] Add Avx2 Span Premultiplication and Reverse --- src/ImageSharp/Common/Helpers/ImageMaths.cs | 6 ++ .../Common/Helpers/Vector4Utilities.cs | 80 ++++++++++++++++--- .../Color/Bulk/PremultiplyVector4.cs | 68 ++++++++++++++++ .../Color/Bulk/UnPremultiplyVector4.cs | 68 ++++++++++++++++ .../Helpers/ImageMathsTests.cs | 15 ++++ .../Helpers/Vector4UtilsTests.cs | 2 + 6 files changed, 229 insertions(+), 10 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs index 977432f8b..d24230fe1 100644 --- a/src/ImageSharp/Common/Helpers/ImageMaths.cs +++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs @@ -132,6 +132,12 @@ namespace SixLabors.ImageSharp return (a / GreatestCommonDivisor(a, b)) * b; } + /// + /// Calculates % 2 + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static int Modulo2(int x) => x & 1; + /// /// Calculates % 4 /// diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs index fccc50755..848a91791 100644 --- a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs @@ -5,6 +5,10 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp { @@ -13,6 +17,10 @@ namespace SixLabors.ImageSharp /// internal static class Vector4Utilities { + private const int BlendAlphaControl = 0b10001000; + + private static ReadOnlySpan PermuteAlphaMask8x32 => new byte[] { 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0 }; + /// /// Restricts a vector between a minimum and a maximum value. /// 5x Faster then . @@ -56,13 +64,39 @@ namespace SixLabors.ImageSharp [MethodImpl(InliningOptions.ShortMethod)] public static void Premultiply(Span vectors) { - // TODO: This method can be AVX2 optimized using Vector - ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported && vectors.Length >= 2) + { + ref Vector256 vectorsBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); - for (int i = 0; i < vectors.Length; i++) + Vector256 mask = + Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); + + int n = (vectors.Length * 4) / Vector256.Count; + for (int i = 0; i < n; i++) + { + ref Vector256 source = ref Unsafe.Add(ref vectorsBase, i); + Vector256 multiply = Avx2.PermuteVar8x32(source, mask); + source = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl); + } + + if (ImageMaths.Modulo2(vectors.Length) != 0) + { + // Vector4 fits neatly in pairs. Any overlap has to be equal to 1. + Premultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1))); + } + } + else +#endif { - ref Vector4 v = ref Unsafe.Add(ref baseRef, i); - Premultiply(ref v); + ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); + + for (int i = 0; i < vectors.Length; i++) + { + ref Vector4 v = ref Unsafe.Add(ref baseRef, i); + Premultiply(ref v); + } } } @@ -73,13 +107,39 @@ namespace SixLabors.ImageSharp [MethodImpl(InliningOptions.ShortMethod)] public static void UnPremultiply(Span vectors) { - // TODO: This method can be AVX2 optimized using Vector - ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported && vectors.Length >= 2) + { + ref Vector256 vectorsBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); - for (int i = 0; i < vectors.Length; i++) + Vector256 mask = + Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); + + int n = (vectors.Length * 4) / Vector256.Count; + for (int i = 0; i < n; i++) + { + ref Vector256 source = ref Unsafe.Add(ref vectorsBase, i); + Vector256 multiply = Avx2.PermuteVar8x32(source, mask); + source = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl); + } + + if (ImageMaths.Modulo2(vectors.Length) != 0) + { + // Vector4 fits neatly in pairs. Any overlap has to be equal to 1. + UnPremultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1))); + } + } + else +#endif { - ref Vector4 v = ref Unsafe.Add(ref baseRef, i); - UnPremultiply(ref v); + ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); + + for (int i = 0; i < vectors.Length; i++) + { + ref Vector4 v = ref Unsafe.Add(ref baseRef, i); + UnPremultiply(ref v); + } } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs new file mode 100644 index 000000000..2a886c687 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs @@ -0,0 +1,68 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.ShortCore31))] + public class PremultiplyVector4 + { + private static readonly Vector4[] Vectors = CreateVectors(); + + [Benchmark(Baseline = true)] + public void PremultiplyBaseline() + { + ref Vector4 baseRef = ref MemoryMarshal.GetReference(Vectors); + + for (int i = 0; i < Vectors.Length; i++) + { + ref Vector4 v = ref Unsafe.Add(ref baseRef, i); + Premultiply(ref v); + } + } + + [Benchmark] + public void Premultiply() + { + Vector4Utilities.Premultiply(Vectors); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void Premultiply(ref Vector4 source) + { + float w = source.W; + source *= w; + source.W = w; + } + + private static Vector4[] CreateVectors() + { + var rnd = new Random(42); + return GenerateRandomVectorArray(rnd, 2048, 0, 1); + } + + private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal) + { + var values = new Vector4[length]; + + for (int i = 0; i < length; i++) + { + ref Vector4 v = ref values[i]; + v.X = GetRandomFloat(rnd, minVal, maxVal); + v.Y = GetRandomFloat(rnd, minVal, maxVal); + v.Z = GetRandomFloat(rnd, minVal, maxVal); + v.W = GetRandomFloat(rnd, minVal, maxVal); + } + + return values; + } + + private static float GetRandomFloat(Random rnd, float minVal, float maxVal) + => ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal; + } +} diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs new file mode 100644 index 000000000..89e055da4 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs @@ -0,0 +1,68 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.ShortCore31))] + public class UnPremultiplyVector4 + { + private static readonly Vector4[] Vectors = CreateVectors(); + + [Benchmark(Baseline = true)] + public void UnPremultiplyBaseline() + { + ref Vector4 baseRef = ref MemoryMarshal.GetReference(Vectors); + + for (int i = 0; i < Vectors.Length; i++) + { + ref Vector4 v = ref Unsafe.Add(ref baseRef, i); + UnPremultiply(ref v); + } + } + + [Benchmark] + public void UnPremultiply() + { + Vector4Utilities.UnPremultiply(Vectors); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void UnPremultiply(ref Vector4 source) + { + float w = source.W; + source *= w; + source.W = w; + } + + private static Vector4[] CreateVectors() + { + var rnd = new Random(42); + return GenerateRandomVectorArray(rnd, 2048, 0, 1); + } + + private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal) + { + var values = new Vector4[length]; + + for (int i = 0; i < length; i++) + { + ref Vector4 v = ref values[i]; + v.X = GetRandomFloat(rnd, minVal, maxVal); + v.Y = GetRandomFloat(rnd, minVal, maxVal); + v.Z = GetRandomFloat(rnd, minVal, maxVal); + v.W = GetRandomFloat(rnd, minVal, maxVal); + } + + return values; + } + + private static float GetRandomFloat(Random rnd, float minVal, float maxVal) + => ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal; + } +} diff --git a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs index 27689f681..7d1662387 100644 --- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs +++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs @@ -10,6 +10,21 @@ namespace SixLabors.ImageSharp.Tests.Helpers { public class ImageMathsTests { + [Theory] + [InlineData(0)] + [InlineData(1)] + [InlineData(2)] + [InlineData(3)] + [InlineData(4)] + [InlineData(100)] + [InlineData(123)] + [InlineData(53436353)] + public void Modulo2(int x) + { + int actual = ImageMaths.Modulo2(x); + Assert.Equal(x % 2, actual); + } + [Theory] [InlineData(0)] [InlineData(1)] diff --git a/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs b/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs index c3b8e79ee..2bb43c440 100644 --- a/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs +++ b/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs @@ -17,6 +17,7 @@ namespace SixLabors.ImageSharp.Tests.Helpers [InlineData(0)] [InlineData(1)] [InlineData(30)] + [InlineData(63)] public void Premultiply_VectorSpan(int length) { var rnd = new Random(42); @@ -36,6 +37,7 @@ namespace SixLabors.ImageSharp.Tests.Helpers [InlineData(0)] [InlineData(1)] [InlineData(30)] + [InlineData(63)] public void UnPremultiply_VectorSpan(int length) { var rnd = new Random(42); From 1067acbe4c57ba7fc601186ad58b1087380a8a69 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 21 Oct 2020 22:22:47 +0100 Subject: [PATCH 2/5] Use Tanner's updated code. --- .../Common/Helpers/Vector4Utilities.cs | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs index 848a91791..5ae7ac1b7 100644 --- a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs @@ -61,7 +61,7 @@ namespace SixLabors.ImageSharp /// Bulk variant of /// /// The span of vectors - [MethodImpl(InliningOptions.ShortMethod)] + [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] public static void Premultiply(Span vectors) { #if SUPPORTS_RUNTIME_INTRINSICS @@ -73,12 +73,15 @@ namespace SixLabors.ImageSharp Vector256 mask = Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); - int n = (vectors.Length * 4) / Vector256.Count; - for (int i = 0; i < n; i++) + // Divide by 2 as 4 elements per Vector4 and 8 per Vector256 + ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u)); + + while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast)) { - ref Vector256 source = ref Unsafe.Add(ref vectorsBase, i); + Vector256 source = vectorsBase; Vector256 multiply = Avx2.PermuteVar8x32(source, mask); - source = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl); + vectorsBase = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl); + vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); } if (ImageMaths.Modulo2(vectors.Length) != 0) @@ -104,7 +107,7 @@ namespace SixLabors.ImageSharp /// Bulk variant of /// /// The span of vectors - [MethodImpl(InliningOptions.ShortMethod)] + [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] public static void UnPremultiply(Span vectors) { #if SUPPORTS_RUNTIME_INTRINSICS @@ -116,12 +119,15 @@ namespace SixLabors.ImageSharp Vector256 mask = Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); - int n = (vectors.Length * 4) / Vector256.Count; - for (int i = 0; i < n; i++) + // Divide by 2 as 4 elements per Vector4 and 8 per Vector256 + ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u)); + + while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast)) { - ref Vector256 source = ref Unsafe.Add(ref vectorsBase, i); + Vector256 source = vectorsBase; Vector256 multiply = Avx2.PermuteVar8x32(source, mask); - source = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl); + vectorsBase = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl); + vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); } if (ImageMaths.Modulo2(vectors.Length) != 0) From d4e0bdd7b7949072c6bc47e07301fce8ab5a96af Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 21 Oct 2020 23:59:37 +0100 Subject: [PATCH 3/5] Remove hotpath attr --- src/ImageSharp/Common/Helpers/Vector4Utilities.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs index 5ae7ac1b7..0137d0256 100644 --- a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs @@ -61,7 +61,7 @@ namespace SixLabors.ImageSharp /// Bulk variant of /// /// The span of vectors - [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] + [MethodImpl(InliningOptions.ShortMethod)] public static void Premultiply(Span vectors) { #if SUPPORTS_RUNTIME_INTRINSICS @@ -107,7 +107,7 @@ namespace SixLabors.ImageSharp /// Bulk variant of /// /// The span of vectors - [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] + [MethodImpl(InliningOptions.ShortMethod)] public static void UnPremultiply(Span vectors) { #if SUPPORTS_RUNTIME_INTRINSICS From e3faadbf2edac8a51d09bf593088f42a073bd60b Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 22 Oct 2020 10:34:42 +0100 Subject: [PATCH 4/5] Use Avx.Shuffle for lower latency --- src/ImageSharp/Common/Helpers/Vector4Utilities.cs | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs index 0137d0256..f617e9a3e 100644 --- a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs @@ -17,9 +17,8 @@ namespace SixLabors.ImageSharp /// internal static class Vector4Utilities { - private const int BlendAlphaControl = 0b10001000; - - private static ReadOnlySpan PermuteAlphaMask8x32 => new byte[] { 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0 }; + private const int BlendAlphaControl = 0b_10_00_10_00; + private const int ShuffleAlphaControl = 0b_11_11_11_11; /// /// Restricts a vector between a minimum and a maximum value. @@ -70,16 +69,13 @@ namespace SixLabors.ImageSharp ref Vector256 vectorsBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); - Vector256 mask = - Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); - // Divide by 2 as 4 elements per Vector4 and 8 per Vector256 ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u)); while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast)) { Vector256 source = vectorsBase; - Vector256 multiply = Avx2.PermuteVar8x32(source, mask); + Vector256 multiply = Avx.Shuffle(source, source, ShuffleAlphaControl); vectorsBase = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl); vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); } @@ -116,16 +112,13 @@ namespace SixLabors.ImageSharp ref Vector256 vectorsBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); - Vector256 mask = - Unsafe.As>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); - // Divide by 2 as 4 elements per Vector4 and 8 per Vector256 ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u)); while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast)) { Vector256 source = vectorsBase; - Vector256 multiply = Avx2.PermuteVar8x32(source, mask); + Vector256 multiply = Avx.Shuffle(source, source, ShuffleAlphaControl); vectorsBase = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl); vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); } From 05b66da9f79a8faba536d3614469d7b477e93eaa Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 22 Oct 2020 10:53:14 +0100 Subject: [PATCH 5/5] Fix base unpremultiply benchmark --- tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs index 89e055da4..1312c767b 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs @@ -36,7 +36,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk private static void UnPremultiply(ref Vector4 source) { float w = source.W; - source *= w; + source /= w; source.W = w; }