diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs
index 977432f8bb..d24230fe18 100644
--- a/src/ImageSharp/Common/Helpers/ImageMaths.cs
+++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs
@@ -132,6 +132,12 @@ namespace SixLabors.ImageSharp
return (a / GreatestCommonDivisor(a, b)) * b;
}
+ /// <summary>
+ /// Calculates <paramref name="x"/> % 2 by masking the lowest bit; unlike the % operator, the result for a negative odd <paramref name="x"/> is 1 rather than -1.
+ /// </summary>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static int Modulo2(int x) => x & 1;
+
///
/// Calculates % 4
///
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
deleted file mode 100644
index b56c92dab7..0000000000
--- a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-#if SUPPORTS_RUNTIME_INTRINSICS
-
-using System;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-
-namespace SixLabors.ImageSharp
-{
- internal static partial class SimdUtils
- {
- public static class Avx2Intrinsics
- {
- private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
-
- ///
- /// as many elements as possible, slicing them down (keeping the remainder).
- ///
- [MethodImpl(InliningOptions.ShortMethod)]
- internal static void NormalizedFloatToByteSaturateReduce(
- ref ReadOnlySpan source,
- ref Span dest)
- {
- DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
-
- if (Avx2.IsSupported)
- {
- int remainder = ImageMaths.ModuloP2(source.Length, Vector.Count);
- int adjustedCount = source.Length - remainder;
-
- if (adjustedCount > 0)
- {
- NormalizedFloatToByteSaturate(
- source.Slice(0, adjustedCount),
- dest.Slice(0, adjustedCount));
-
- source = source.Slice(adjustedCount);
- dest = dest.Slice(adjustedCount);
- }
- }
- }
-
- ///
- /// Implementation of , which is faster on new .NET runtime.
- ///
- ///
- /// Implementation is based on MagicScaler code:
- /// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477
- ///
- internal static void NormalizedFloatToByteSaturate(
- ReadOnlySpan source,
- Span dest)
- {
- VerifySpanInput(source, dest, Vector256.Count);
-
- int n = dest.Length / Vector256.Count;
-
- ref Vector256 sourceBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
- ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
-
- var maxBytes = Vector256.Create(255f);
- ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
- Vector256 mask = Unsafe.As>(ref maskBase);
-
- for (int i = 0; i < n; i++)
- {
- ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4);
-
- Vector256 f0 = s;
- Vector256 f1 = Unsafe.Add(ref s, 1);
- Vector256 f2 = Unsafe.Add(ref s, 2);
- Vector256 f3 = Unsafe.Add(ref s, 3);
-
- Vector256 w0 = ConvertToInt32(f0, maxBytes);
- Vector256 w1 = ConvertToInt32(f1, maxBytes);
- Vector256 w2 = ConvertToInt32(f2, maxBytes);
- Vector256 w3 = ConvertToInt32(f3, maxBytes);
-
- Vector256 u0 = Avx2.PackSignedSaturate(w0, w1);
- Vector256 u1 = Avx2.PackSignedSaturate(w2, w3);
- Vector256 b = Avx2.PackUnsignedSaturate(u0, u1);
- b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();
-
- Unsafe.Add(ref destBase, i) = b;
- }
- }
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static Vector256 ConvertToInt32(Vector256 vf, Vector256 scale)
- {
- vf = Avx.Multiply(vf, scale);
- return Avx.ConvertToVector256Int32(vf);
- }
- }
- }
-}
-#endif
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
new file mode 100644
index 0000000000..2d788992ee
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -0,0 +1,301 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp
+{
+ internal static partial class SimdUtils
+ {
+ public static class HwIntrinsics
+ {
+ public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
+
+ public static ReadOnlySpan PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 };
+
+ /// <summary>
+ /// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>: (<paramref name="vm0"/> * <paramref name="vm1"/>) + <paramref name="va"/>.
+ /// </summary>
+ /// <param name="va">The vector to add to the intermediate result.</param>
+ /// <param name="vm0">The first vector to multiply.</param>
+ /// <param name="vm1">The second vector to multiply.</param>
+ /// <returns>The <see cref="Vector256{T}"/>.</returns>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static Vector256 MultiplyAdd(
+ in Vector256 va,
+ in Vector256 vm0,
+ in Vector256 vm1)
+ {
+ if (Fma.IsSupported)
+ {
+ return Fma.MultiplyAdd(vm1, vm0, va); // fused form of (vm1 * vm0) + va
+ }
+ else
+ {
+ return Avx.Add(Avx.Multiply(vm0, vm1), va); // separate multiply and add when FMA is not available
+ }
+ }
+
+ /// <summary>
+ /// Converts as many elements as possible with <see cref="ByteToNormalizedFloat"/>, then slices both spans down so that only the unconverted remainder is left.
+ /// </summary>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ internal static void ByteToNormalizedFloatReduce(
+ ref ReadOnlySpan source,
+ ref Span dest)
+ {
+ DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+ if (Avx2.IsSupported || Sse2.IsSupported)
+ {
+ int remainder;
+ if (Avx2.IsSupported)
+ {
+ remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count);
+ }
+ else
+ {
+ remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count);
+ }
+
+ int adjustedCount = source.Length - remainder;
+
+ if (adjustedCount > 0)
+ {
+ ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
+
+ source = source.Slice(adjustedCount); // the caller's spans now cover only the tail that was not converted
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+            /// <summary>
+            /// Converts each byte of <paramref name="source"/> to a float scaled by 1 / 255 (hardware-intrinsics version of <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime).
+            /// </summary>
+            /// <param name="source">The bytes to convert; pinned for the duration of the call.</param>
+            /// <param name="dest">The destination floats; both spans are passed to VerifySpanInput together with the byte vector width before converting.</param>
+            /// <remarks>
+            /// Implementation is based on MagicScaler code:
+            /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
+            /// </remarks>
+            internal static unsafe void ByteToNormalizedFloat(
+                ReadOnlySpan<byte> source,
+                Span<float> dest)
+            {
+                // Pin the source for the whole conversion: the loads below go through a raw pointer, which the GC
+                // does not track, so an unpinned managed buffer could be relocated while the loop is reading it.
+                fixed (byte* sourceBase = source)
+                {
+                    if (Avx2.IsSupported)
+                    {
+                        VerifySpanInput(source, dest, Vector256<byte>.Count);
+
+                        int n = dest.Length / Vector256<byte>.Count;
+
+                        ref Vector256<float> destBase =
+                            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));
+
+                        var scale = Vector256.Create(1 / (float)byte.MaxValue);
+
+                        for (int i = 0; i < n; i++)
+                        {
+                            int si = Vector256<byte>.Count * i;
+                            Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
+                            Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
+                            Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
+                            Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));
+
+                            Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
+                            Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
+                            Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
+                            Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));
+
+                            ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);
+                            d = f0;
+                            Unsafe.Add(ref d, 1) = f1;
+                            Unsafe.Add(ref d, 2) = f2;
+                            Unsafe.Add(ref d, 3) = f3;
+                        }
+                    }
+                    else
+                    {
+                        // Sse2 path; Sse4.1 supplies the byte-to-int32 widening load when it is available.
+                        VerifySpanInput(source, dest, Vector128<byte>.Count);
+
+                        int n = dest.Length / Vector128<byte>.Count;
+
+                        ref Vector128<float> destBase =
+                            ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));
+
+                        var scale = Vector128.Create(1 / (float)byte.MaxValue);
+                        Vector128<byte> zero = Vector128<byte>.Zero;
+
+                        for (int i = 0; i < n; i++)
+                        {
+                            int si = Vector128<byte>.Count * i;
+                            Vector128<int> i0, i1, i2, i3;
+                            if (Sse41.IsSupported)
+                            {
+                                i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
+                                i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
+                                i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
+                                i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
+                            }
+                            else
+                            {
+                                Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
+                                Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
+                                Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();
+
+                                i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
+                                i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
+                                i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
+                                i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
+                            }
+
+                            Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
+                            Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
+                            Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
+                            Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));
+
+                            ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);
+                            d = f0;
+                            Unsafe.Add(ref d, 1) = f1;
+                            Unsafe.Add(ref d, 2) = f2;
+                            Unsafe.Add(ref d, 3) = f3;
+                        }
+                    }
+                }
+            }
+
+ /// <summary>
+ /// Converts as many elements as possible with <see cref="NormalizedFloatToByteSaturate"/>, then slices both spans down so that only the unconverted remainder is left.
+ /// </summary>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ internal static void NormalizedFloatToByteSaturateReduce(
+ ref ReadOnlySpan source,
+ ref Span dest)
+ {
+ DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+ if (Avx2.IsSupported || Sse2.IsSupported)
+ {
+ int remainder;
+ if (Avx2.IsSupported)
+ {
+ remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count);
+ }
+ else
+ {
+ remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count);
+ }
+
+ int adjustedCount = source.Length - remainder;
+
+ if (adjustedCount > 0)
+ {
+ NormalizedFloatToByteSaturate(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount));
+
+ source = source.Slice(adjustedCount); // the caller's spans now cover only the tail that was not converted
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Converts normalized floats to bytes: multiplies by 255, converts to int32 and narrows to bytes with saturating packs. Hardware-intrinsics implementation, which is faster on new .NET runtime.
+ /// </summary>
+ /// <remarks>
+ /// Implementation is based on MagicScaler code:
+ /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622
+ /// </remarks>
+ internal static void NormalizedFloatToByteSaturate(
+ ReadOnlySpan source,
+ Span dest)
+ {
+ if (Avx2.IsSupported)
+ {
+ VerifySpanInput(source, dest, Vector256.Count);
+
+ int n = dest.Length / Vector256.Count;
+
+ ref Vector256 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ var scale = Vector256.Create((float)byte.MaxValue);
+ ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
+ Vector256 mask = Unsafe.As>(ref maskBase);
+
+ for (int i = 0; i < n; i++)
+ {
+ ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4); // four source vectors are consumed per destination vector
+
+ Vector256 f0 = Avx.Multiply(scale, s);
+ Vector256 f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1));
+ Vector256 f2 = Avx.Multiply(scale, Unsafe.Add(ref s, 2));
+ Vector256 f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3));
+
+ Vector256 w0 = Avx.ConvertToVector256Int32(f0);
+ Vector256 w1 = Avx.ConvertToVector256Int32(f1);
+ Vector256 w2 = Avx.ConvertToVector256Int32(f2);
+ Vector256 w3 = Avx.ConvertToVector256Int32(f3);
+
+ Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); // int32 -> int16 with signed saturation
+ Vector256 u1 = Avx2.PackSignedSaturate(w2, w3);
+ Vector256 b = Avx2.PackUnsignedSaturate(u0, u1); // int16 -> byte with unsigned saturation
+ b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte(); // restore element order: the 256-bit packs work per 128-bit lane, leaving the 32-bit groups interleaved
+
+ Unsafe.Add(ref destBase, i) = b;
+ }
+ }
+ else
+ {
+ // Sse2 fallback: same scale / convert / pack sequence on 128-bit vectors, with no permute step afterwards.
+ VerifySpanInput(source, dest, Vector128.Count);
+
+ int n = dest.Length / Vector128.Count;
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ var scale = Vector128.Create((float)byte.MaxValue);
+
+ for (int i = 0; i < n; i++)
+ {
+ ref Vector128 s = ref Unsafe.Add(ref sourceBase, i * 4);
+
+ Vector128 f0 = Sse.Multiply(scale, s);
+ Vector128 f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1));
+ Vector128 f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2));
+ Vector128 f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3));
+
+ Vector128 w0 = Sse2.ConvertToVector128Int32(f0);
+ Vector128 w1 = Sse2.ConvertToVector128Int32(f1);
+ Vector128 w2 = Sse2.ConvertToVector128Int32(f2);
+ Vector128 w3 = Sse2.ConvertToVector128Int32(f3);
+
+ Vector128 u0 = Sse2.PackSignedSaturate(w0, w1);
+ Vector128 u1 = Sse2.PackSignedSaturate(w2, w3);
+
+ Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1);
+ }
+ }
+ }
+ }
+ }
+}
+#endif
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs
index 7f917648dc..df533cedf1 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs
@@ -79,8 +79,9 @@ namespace SixLabors.ImageSharp
internal static void ByteToNormalizedFloat(ReadOnlySpan source, Span dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
-
-#if SUPPORTS_EXTENDED_INTRINSICS
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest);
+#elif SUPPORTS_EXTENDED_INTRINSICS
ExtendedIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest);
#else
BasicIntrinsics256.ByteToNormalizedFloatReduce(ref source, ref dest);
@@ -110,7 +111,7 @@ namespace SixLabors.ImageSharp
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
#if SUPPORTS_RUNTIME_INTRINSICS
- Avx2Intrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest);
+ HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest);
#elif SUPPORTS_EXTENDED_INTRINSICS
ExtendedIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest);
#else
diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs
index fccc50755d..f617e9a3ea 100644
--- a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs
@@ -5,6 +5,10 @@ using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
namespace SixLabors.ImageSharp
{
@@ -13,6 +17,9 @@ namespace SixLabors.ImageSharp
///
internal static class Vector4Utilities
{
+ private const int BlendAlphaControl = 0b_10_00_10_00; // Avx.Blend mask: bits 3 and 7 are set, so elements 3 and 7 (the W of each packed Vector4) are taken from the second operand.
+ private const int ShuffleAlphaControl = 0b_11_11_11_11; // Avx.Shuffle control: every 2-bit selector is 3, which broadcasts element 3 (W) across each 128-bit lane.
+
///
/// Restricts a vector between a minimum and a maximum value.
/// 5x Faster then .
@@ -56,13 +63,39 @@ namespace SixLabors.ImageSharp
[MethodImpl(InliningOptions.ShortMethod)]
public static void Premultiply(Span vectors)
{
- // TODO: This method can be AVX2 optimized using Vector
- ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx2.IsSupported && vectors.Length >= 2)
+ {
+ ref Vector256 vectorsBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors));
- for (int i = 0; i < vectors.Length; i++)
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256
+ ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));
+
+ while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
+ {
+ Vector256 source = vectorsBase;
+ Vector256 multiply = Avx.Shuffle(source, source, ShuffleAlphaControl);
+ vectorsBase = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl);
+ vectorsBase = ref Unsafe.Add(ref vectorsBase, 1);
+ }
+
+ if (ImageMaths.Modulo2(vectors.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ Premultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
+ }
+ }
+ else
+#endif
{
- ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
- Premultiply(ref v);
+ ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+
+ for (int i = 0; i < vectors.Length; i++)
+ {
+ ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
+ Premultiply(ref v);
+ }
}
}
@@ -73,13 +106,39 @@ namespace SixLabors.ImageSharp
[MethodImpl(InliningOptions.ShortMethod)]
public static void UnPremultiply(Span vectors)
{
- // TODO: This method can be AVX2 optimized using Vector
- ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx2.IsSupported && vectors.Length >= 2)
+ {
+ ref Vector256 vectorsBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors));
- for (int i = 0; i < vectors.Length; i++)
+ // Divide by 2 as 4 elements per Vector4 and 8 per Vector256
+ ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));
+
+ while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
+ {
+ Vector256 source = vectorsBase;
+ Vector256 multiply = Avx.Shuffle(source, source, ShuffleAlphaControl);
+ vectorsBase = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl);
+ vectorsBase = ref Unsafe.Add(ref vectorsBase, 1);
+ }
+
+ if (ImageMaths.Modulo2(vectors.Length) != 0)
+ {
+ // Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
+ UnPremultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
+ }
+ }
+ else
+#endif
{
- ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
- UnPremultiply(ref v);
+ ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+
+ for (int i = 0; i < vectors.Length; i++)
+ {
+ ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
+ UnPremultiply(ref v);
+ }
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
index c4d1408a2e..1319b56ee0 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
@@ -1,11 +1,15 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
-
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using static SixLabors.ImageSharp.SimdUtils;
+#endif
using SixLabors.ImageSharp.Tuples;
// ReSharper disable ImpureMethodCallOnReadonlyValueField
@@ -47,6 +51,73 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
"JpegColorConverter.FromYCbCrSimd256 can be used only on architecture having 256 byte floating point SIMD registers!");
}
+#if SUPPORTS_RUNTIME_INTRINSICS
+ ref Vector256 yBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0));
+ ref Vector256 cbBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1));
+ ref Vector256 crBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2));
+
+ ref Vector256 resultBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(result));
+
+ // Used for the color conversion
+ var chromaOffset = Vector256.Create(-halfValue);
+ var scale = Vector256.Create(1 / maxValue);
+ var rCrMult = Vector256.Create(1.402F);
+ var gCbMult = Vector256.Create(-0.344136F);
+ var gCrMult = Vector256.Create(-0.714136F);
+ var bCbMult = Vector256.Create(1.772F);
+
+ // Used for packing.
+ var va = Vector256.Create(1F);
+ ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
+ Vector256 vcontrol = Unsafe.As>(ref control);
+
+ // Walking 8 elements at one step:
+ int n = result.Length / 8;
+ for (int i = 0; i < n; i++)
+ {
+ // y = yVals[i];
+ // cb = cbVals[i] - 128F;
+ // cr = crVals[i] - 128F;
+ Vector256 y = Unsafe.Add(ref yBase, i);
+ Vector256 cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset);
+ Vector256 cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset);
+
+ y = Avx2.PermuteVar8x32(y, vcontrol);
+ cb = Avx2.PermuteVar8x32(cb, vcontrol);
+ cr = Avx2.PermuteVar8x32(cr, vcontrol);
+
+ // r = y + (1.402F * cr);
+ // g = y - (0.344136F * cb) - (0.714136F * cr);
+ // b = y + (1.772F * cb);
+ // Adding & multiplying 8 elements at one time:
+ Vector256 r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult);
+ Vector256 g = HwIntrinsics.MultiplyAdd(HwIntrinsics.MultiplyAdd(y, cb, gCbMult), cr, gCrMult);
+ Vector256 b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult);
+
+ // TODO: We should be saving to RGBA not Vector4
+ r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale);
+ g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale);
+ b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale);
+
+ Vector256 vte = Avx.UnpackLow(r, b);
+ Vector256 vto = Avx.UnpackLow(g, va);
+
+ ref Vector256 destination = ref Unsafe.Add(ref resultBase, i * 4);
+
+ destination = Avx.UnpackLow(vte, vto);
+ Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto);
+
+ vte = Avx.UnpackHigh(r, b);
+ vto = Avx.UnpackHigh(g, va);
+
+ Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto);
+ Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto);
+ }
+#else
ref Vector yBase =
ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector cbBase =
@@ -104,6 +175,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
destination.Pack(ref rr, ref gg, ref bb);
}
+#endif
}
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
index f68bca0412..7c780700c9 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
@@ -4,7 +4,6 @@
using System;
using System.Collections.Generic;
using System.Numerics;
-
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.Tuples;
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs
index da15da24c7..dc030e07a7 100644
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs
@@ -13,15 +13,13 @@ using System.Runtime.Intrinsics.X86;
#endif
using BenchmarkDotNet.Attributes;
-using BenchmarkDotNet.Environments;
-using BenchmarkDotNet.Jobs;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{
- [Config(typeof(Config.ShortClr))]
+ [Config(typeof(Config.ShortCore31))]
public abstract class FromVector4
where TPixel : unmanaged, IPixel
{
@@ -104,12 +102,12 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
#if SUPPORTS_RUNTIME_INTRINSICS
[Benchmark]
- public void UseAvx2()
+ public void UseHwIntrinsics()
{
Span sBytes = MemoryMarshal.Cast(this.source.GetSpan());
Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan());
- SimdUtils.Avx2Intrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats);
+ SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats);
}
private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs
new file mode 100644
index 0000000000..2a886c6879
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs
@@ -0,0 +1,68 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using BenchmarkDotNet.Attributes;
+
+namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
+{
+ [Config(typeof(Config.ShortCore31))]
+ public class PremultiplyVector4
+ {
+ private static readonly Vector4[] Vectors = CreateVectors(); // NOTE(review): shared input that both benchmarks overwrite in place, so later invocations run on already-premultiplied values - confirm this is acceptable
+
+ [Benchmark(Baseline = true)]
+ public void PremultiplyBaseline()
+ {
+ ref Vector4 baseRef = ref MemoryMarshal.GetReference(Vectors);
+
+ for (int i = 0; i < Vectors.Length; i++)
+ {
+ ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
+ Premultiply(ref v);
+ }
+ }
+
+ [Benchmark]
+ public void Premultiply()
+ {
+ Vector4Utilities.Premultiply(Vectors);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Premultiply(ref Vector4 source)
+ {
+ float w = source.W;
+ source *= w;
+ source.W = w; // put back the alpha, which the multiply above scaled as well
+ }
+
+ private static Vector4[] CreateVectors()
+ {
+ var rnd = new Random(42); // fixed seed, so every run generates the same input
+ return GenerateRandomVectorArray(rnd, 2048, 0, 1);
+ }
+
+ private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
+ {
+ var values = new Vector4[length];
+
+ for (int i = 0; i < length; i++)
+ {
+ ref Vector4 v = ref values[i];
+ v.X = GetRandomFloat(rnd, minVal, maxVal);
+ v.Y = GetRandomFloat(rnd, minVal, maxVal);
+ v.Z = GetRandomFloat(rnd, minVal, maxVal);
+ v.W = GetRandomFloat(rnd, minVal, maxVal);
+ }
+
+ return values;
+ }
+
+ private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
+ => ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal;
+ }
+}
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgba32.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgba32.cs
index 145bf9889b..9ae3b073d4 100644
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgba32.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgba32.cs
@@ -13,7 +13,7 @@ using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{
- [Config(typeof(Config.ShortClr))]
+ [Config(typeof(Config.ShortCore31))]
public class ToVector4_Rgba32 : ToVector4
{
[Benchmark]
@@ -52,6 +52,17 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(sBytes, dFloats);
}
+#if SUPPORTS_RUNTIME_INTRINSICS
+ [Benchmark]
+ public void HwIntrinsics()
+ {
+ Span sBytes = MemoryMarshal.Cast(this.source.GetSpan());
+ Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan());
+
+ SimdUtils.HwIntrinsics.ByteToNormalizedFloat(sBytes, dFloats);
+ }
+#endif
+
// [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops()
{
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs
new file mode 100644
index 0000000000..1312c767be
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs
@@ -0,0 +1,68 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using BenchmarkDotNet.Attributes;
+
+namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
+{
+ [Config(typeof(Config.ShortCore31))]
+ public class UnPremultiplyVector4
+ {
+ private static readonly Vector4[] Vectors = CreateVectors(); // NOTE(review): shared input that both benchmarks overwrite in place, so each invocation divides the previous result by W again - confirm this is acceptable
+
+ [Benchmark(Baseline = true)]
+ public void UnPremultiplyBaseline()
+ {
+ ref Vector4 baseRef = ref MemoryMarshal.GetReference(Vectors);
+
+ for (int i = 0; i < Vectors.Length; i++)
+ {
+ ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
+ UnPremultiply(ref v);
+ }
+ }
+
+ [Benchmark]
+ public void UnPremultiply()
+ {
+ Vector4Utilities.UnPremultiply(Vectors);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void UnPremultiply(ref Vector4 source)
+ {
+ float w = source.W;
+ source /= w;
+ source.W = w; // put back the alpha, which the divide above changed as well
+ }
+
+ private static Vector4[] CreateVectors()
+ {
+ var rnd = new Random(42); // fixed seed, so every run generates the same input
+ return GenerateRandomVectorArray(rnd, 2048, 0, 1);
+ }
+
+ private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
+ {
+ var values = new Vector4[length];
+
+ for (int i = 0; i < length; i++)
+ {
+ ref Vector4 v = ref values[i];
+ v.X = GetRandomFloat(rnd, minVal, maxVal);
+ v.Y = GetRandomFloat(rnd, minVal, maxVal);
+ v.Z = GetRandomFloat(rnd, minVal, maxVal);
+ v.W = GetRandomFloat(rnd, minVal, maxVal);
+ }
+
+ return values;
+ }
+
+ private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
+ => ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal;
+ }
+}
diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
index e860c5491f..e8a06bf24e 100644
--- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
+++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
@@ -73,7 +73,9 @@ namespace SixLabors.ImageSharp.Benchmarks
}
#endif
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
- .WithEnvironmentVariables(new EnvironmentVariable(EnableHWIntrinsic, Off))
+ .WithEnvironmentVariables(
+ new EnvironmentVariable(EnableHWIntrinsic, Off),
+ new EnvironmentVariable(FeatureSIMD, Off))
.WithId("No HwIntrinsics"));
}
}
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
index 6dce489353..838db742a1 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@@ -7,7 +7,7 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Common.Tuples;
-
+using SixLabors.ImageSharp.Tests.TestUtilities;
using Xunit;
using Xunit.Abstractions;
@@ -204,6 +204,25 @@ namespace SixLabors.ImageSharp.Tests.Common
(s, d) => SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(s.Span, d.Span));
}
+#if SUPPORTS_RUNTIME_INTRINSICS
+ [Theory]
+ [MemberData(nameof(ArraySizesDivisibleBy32))]
+ public void HwIntrinsics_BulkConvertByteToNormalizedFloat(int count)
+ {
+ static void RunTest(string serialized)
+ {
+ TestImpl_BulkConvertByteToNormalizedFloat(
+ FeatureTestRunner.Deserialize(serialized),
+ (s, d) => SimdUtils.HwIntrinsics.ByteToNormalizedFloat(s.Span, d.Span));
+ }
+
+ FeatureTestRunner.RunWithHwIntrinsicsFeature(
+ RunTest,
+ HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41,
+ count);
+ }
+#endif
+
[Theory]
[MemberData(nameof(ArbitraryArraySizes))]
public void BulkConvertByteToNormalizedFloat(int count)
@@ -281,16 +300,19 @@ namespace SixLabors.ImageSharp.Tests.Common
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy32))]
- public void Avx2_BulkConvertNormalizedFloatToByteClampOverflows(int count)
+ public void HwIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
- if (!System.Runtime.Intrinsics.X86.Avx2.IsSupported)
+ static void RunTest(string serialized)
{
- return;
+ TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
+ FeatureTestRunner.Deserialize(serialized),
+ (s, d) => SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span));
}
- TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
- count,
- (s, d) => SimdUtils.Avx2Intrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span));
+ FeatureTestRunner.RunWithHwIntrinsicsFeature(
+ RunTest,
+ HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2,
+ count);
}
#endif
diff --git a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
index 27689f6813..7d16623877 100644
--- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
+++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
@@ -10,6 +10,21 @@ namespace SixLabors.ImageSharp.Tests.Helpers
{
public class ImageMathsTests
{
+ [Theory]
+ [InlineData(0)]
+ [InlineData(1)]
+ [InlineData(2)]
+ [InlineData(3)]
+ [InlineData(4)]
+ [InlineData(100)]
+ [InlineData(123)]
+ [InlineData(53436353)]
+ public void Modulo2(int x)
+ {
+ int actual = ImageMaths.Modulo2(x);
+ Assert.Equal(x % 2, actual);
+ }
+
[Theory]
[InlineData(0)]
[InlineData(1)]
diff --git a/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs b/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs
index c3b8e79ee2..2bb43c440b 100644
--- a/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs
+++ b/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs
@@ -17,6 +17,7 @@ namespace SixLabors.ImageSharp.Tests.Helpers
[InlineData(0)]
[InlineData(1)]
[InlineData(30)]
+ [InlineData(63)]
public void Premultiply_VectorSpan(int length)
{
var rnd = new Random(42);
@@ -36,6 +37,7 @@ namespace SixLabors.ImageSharp.Tests.Helpers
[InlineData(0)]
[InlineData(1)]
[InlineData(30)]
+ [InlineData(63)]
public void UnPremultiply_VectorSpan(int length)
{
var rnd = new Random(42);
diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
index eb1714baad..fdba9ce982 100644
--- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
+++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
@@ -33,6 +33,14 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities
where T : IXunitSerializable
=> BasicSerializer.Deserialize(value);
+ /// <summary>
+ /// Allows the deserialization of integers passed to the feature test.
+ /// </summary>
+ /// <param name="value">The string value to deserialize.</param>
+ /// <returns>The <see cref="int"/> value.</returns>
+ public static int Deserialize(string value)
+ => Convert.ToInt32(value);
+
///
/// Runs the given test within an environment
/// where the given features.
@@ -201,6 +209,48 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities
}
}
+ /// <summary>
+ /// Runs the given test <paramref name="action"/> once for each entry that ToFeatureKeyValueCollection yields for <paramref name="intrinsics"/>:
+ /// in a remote process with the entry's COMPlus_ variable set to "0", or in the current process for AllowAll. Returns without running anything when RemoteExecutor is not supported.
+ /// </summary>
+ /// <param name="action">The test action to run.</param>
+ /// <param name="intrinsics">The intrinsics features.</param>
+ /// <param name="serializable">The value to pass, converted with ToString(), as a parameter to the test action.</param>
+ public static void RunWithHwIntrinsicsFeature(
+ Action action,
+ HwIntrinsics intrinsics,
+ int serializable)
+ {
+ if (!RemoteExecutor.IsSupported)
+ {
+ return;
+ }
+
+ foreach (KeyValuePair intrinsic in intrinsics.ToFeatureKeyValueCollection())
+ {
+ var processStartInfo = new ProcessStartInfo();
+ if (intrinsic.Key != HwIntrinsics.AllowAll)
+ {
+ processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0";
+
+ RemoteExecutor.Invoke(
+ action,
+ serializable.ToString(),
+ new RemoteInvokeOptions
+ {
+ StartInfo = processStartInfo
+ })
+ .Dispose();
+ }
+ else
+ {
+ // Since we are running using the default architecture there is no
+ // point creating the overhead of running the action in a separate process.
+ action(serializable.ToString());
+ }
+ }
+ }
+
internal static Dictionary ToFeatureKeyValueCollection(this HwIntrinsics intrinsics)
{
// Loop through and translate the given values into COMPlus equivaluents