diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs new file mode 100644 index 000000000..2fe2f99ac --- /dev/null +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -0,0 +1,276 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +#if SUPPORTS_RUNTIME_INTRINSICS +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + public static class HwIntrinsics + { + private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + + /// + /// as many elements as possible, slicing them down (keeping the remainder). + /// + [MethodImpl(InliningOptions.ShortMethod)] + internal static void ByteToNormalizedFloatReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); + + if (Avx2.IsSupported || Sse2.IsSupported) + { + int remainder; + if (Avx2.IsSupported) + { + remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count); + } + else + { + remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); + } + + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount)); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); + } + } + } + + /// + /// Implementation , which is faster on new RyuJIT runtime. + /// + /// + /// Implementation is based on MagicScaler code: + /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182 + /// + internal static unsafe void ByteToNormalizedFloat( + ReadOnlySpan source, + Span dest) + { + if (Avx2.IsSupported) + { + VerifySpanInput(source, dest, Vector256.Count); + + int n = dest.Length / Vector256.Count; + + byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)); + + ref Vector256 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = Vector256.Create(1 / (float)byte.MaxValue); + + for (int i = 0; i < n; i++) + { + int si = Vector256.Count * i; + Vector256 i0 = Avx2.ConvertToVector256Int32(sourceBase + si); + Vector256 i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256.Count); + Vector256 i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256.Count * 2)); + Vector256 i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256.Count * 3)); + + Vector256 f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0)); + Vector256 f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1)); + Vector256 f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2)); + Vector256 f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3)); + + ref Vector256 d = ref Unsafe.Add(ref destBase, i * 4); + + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; + } + } + else + { + // Sse + VerifySpanInput(source, dest, Vector128.Count); + + int n = dest.Length / Vector128.Count; + + byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)); + + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = Vector128.Create(1 / (float)byte.MaxValue); + Vector128 zero = Vector128.Zero; + + for (int i = 0; i < n; i++) + { + int si = Vector128.Count * i; + + Vector128 i0, i1, i2, i3; + if (Sse41.IsSupported) + { + i0 = Sse41.ConvertToVector128Int32(sourceBase + si); + i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128.Count); + i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128.Count * 2)); + i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128.Count * 3)); + } + else + { + Vector128 b = Sse2.LoadVector128(sourceBase + si); + Vector128 s0 = Sse2.UnpackLow(b, zero).AsInt16(); + Vector128 s1 = Sse2.UnpackHigh(b, zero).AsInt16(); + + i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32(); + i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32(); + i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32(); + i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32(); + } + + Vector128 f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0)); + Vector128 f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1)); + Vector128 f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2)); + Vector128 f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3)); + + ref Vector128 d = ref Unsafe.Add(ref destBase, i * 4); + + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; + } + } + } + + /// + /// as many elements as possible, slicing them down (keeping the remainder). + /// + [MethodImpl(InliningOptions.ShortMethod)] + internal static void NormalizedFloatToByteSaturateReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); + + if (Avx2.IsSupported || Sse2.IsSupported) + { + int remainder; + if (Avx2.IsSupported) + { + remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count); + } + else + { + remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); + } + + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + NormalizedFloatToByteSaturate( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount)); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); + } + } + } + + /// + /// Implementation of , which is faster on new .NET runtime. + /// + /// + /// Implementation is based on MagicScaler code: + /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622 + /// + internal static void NormalizedFloatToByteSaturate( + ReadOnlySpan source, + Span dest) + { + if (Avx2.IsSupported) + { + VerifySpanInput(source, dest, Vector256.Count); + + int n = dest.Length / Vector256.Count; + + ref Vector256 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector256 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = Vector256.Create((float)byte.MaxValue); + ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32); + Vector256 mask = Unsafe.As>(ref maskBase); + + for (int i = 0; i < n; i++) + { + ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4); + + Vector256 f0 = Avx.Multiply(scale, s); + Vector256 f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1)); + Vector256 f2 = Avx.Multiply(scale, Unsafe.Add(ref s, 2)); + Vector256 f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3)); + + Vector256 w0 = Avx.ConvertToVector256Int32(f0); + Vector256 w1 = Avx.ConvertToVector256Int32(f1); + Vector256 w2 = Avx.ConvertToVector256Int32(f2); + Vector256 w3 = Avx.ConvertToVector256Int32(f3); + + Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); + Vector256 u1 = Avx2.PackSignedSaturate(w2, w3); + Vector256 b = Avx2.PackUnsignedSaturate(u0, u1); + b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte(); + + Unsafe.Add(ref destBase, i) = b; + } + } + else + { + // Sse + VerifySpanInput(source, dest, Vector128.Count); + + int n = dest.Length / Vector128.Count; + + ref Vector128 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = Vector128.Create((float)byte.MaxValue); + + for (int i = 0; i < n; i++) + { + ref Vector128 s = ref Unsafe.Add(ref sourceBase, i * 4); + + Vector128 f0 = Sse.Multiply(scale, s); + Vector128 f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1)); + Vector128 f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2)); + Vector128 f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3)); + + Vector128 w0 = Sse2.ConvertToVector128Int32(f0); + Vector128 w1 = Sse2.ConvertToVector128Int32(f1); + Vector128 w2 = Sse2.ConvertToVector128Int32(f2); + Vector128 w3 = Sse2.ConvertToVector128Int32(f3); + + Vector128 u0 = Sse2.PackSignedSaturate(w0, w1); + Vector128 u1 = Sse2.PackSignedSaturate(w2, w3); + + Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1); + } + } + } + } + } +} +#endif diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 7f917648d..df533cedf 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -79,8 +79,9 @@ namespace SixLabors.ImageSharp internal static void ByteToNormalizedFloat(ReadOnlySpan source, Span dest) { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); - -#if SUPPORTS_EXTENDED_INTRINSICS +#if SUPPORTS_RUNTIME_INTRINSICS + HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest); +#elif SUPPORTS_EXTENDED_INTRINSICS ExtendedIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest); #else BasicIntrinsics256.ByteToNormalizedFloatReduce(ref source, ref dest); @@ -110,7 +111,7 @@ namespace SixLabors.ImageSharp DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); #if SUPPORTS_RUNTIME_INTRINSICS - Avx2Intrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest); + HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest); #elif SUPPORTS_EXTENDED_INTRINSICS ExtendedIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest); #else diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs index da15da24c..267bca4ad 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs @@ -104,12 +104,12 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk #if SUPPORTS_RUNTIME_INTRINSICS [Benchmark] - public void UseAvx2() + public void UseHwIntrinsics() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - SimdUtils.Avx2Intrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats); + SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats); } private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 6dce48935..eca4e72cb 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -204,6 +204,17 @@ namespace SixLabors.ImageSharp.Tests.Common (s, d) => SimdUtils.ExtendedIntrinsics.ByteToNormalizedFloat(s.Span, d.Span)); } +#if SUPPORTS_RUNTIME_INTRINSICS + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy32))] + public void HwIntrinsics_BulkConvertByteToNormalizedFloat(int count) + { + TestImpl_BulkConvertByteToNormalizedFloat( + count, + (s, d) => SimdUtils.HwIntrinsics.ByteToNormalizedFloat(s.Span, d.Span)); + } +#endif + [Theory] [MemberData(nameof(ArbitraryArraySizes))] public void BulkConvertByteToNormalizedFloat(int count) @@ -281,16 +292,11 @@ namespace SixLabors.ImageSharp.Tests.Common [Theory] [MemberData(nameof(ArraySizesDivisibleBy32))] - public void Avx2_BulkConvertNormalizedFloatToByteClampOverflows(int count) + public void HwIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) { - if (!System.Runtime.Intrinsics.X86.Avx2.IsSupported) - { - return; - } - TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( count, - (s, d) => SimdUtils.Avx2Intrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span)); + (s, d) => SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span)); } #endif