diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 0e093a834..b618e1e65 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -15,13 +15,17 @@ jobs:
matrix:
options:
- os: ubuntu-latest
+ framework: netcoreapp3.1
+ runtime: -x64
+ codecov: true
+ - os: macos-latest
framework: netcoreapp3.1
runtime: -x64
codecov: false
- os: windows-latest
framework: netcoreapp3.1
runtime: -x64
- codecov: true
+ codecov: false
- os: windows-latest
framework: netcoreapp2.1
runtime: -x64
diff --git a/.runsettings b/.runsettings
new file mode 100644
index 000000000..ca48342bd
--- /dev/null
+++ b/.runsettings
@@ -0,0 +1,7 @@
+
+
+
+
+ category!=failing
+
+
diff --git a/Directory.Build.props b/Directory.Build.props
index 0f9c5bdde..bb97810a8 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -15,6 +15,7 @@
$(MSBuildThisFileDirectory)artifacts/
$(SixLaborsProjectCategory)/$(MSBuildProjectName)
https://github.com/SixLabors/ImageSharp/
+ $(MSBuildThisFileDirectory).runsettings
@@ -120,6 +121,7 @@
https://api.nuget.org/v3/index.json;
https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-eng/nuget/v3/index.json;
+ https://www.myget.org/F/coverlet-dev/api/v3/index.json;
true
$(MSBuildThisFileDirectory)shared-infrastructure/SixLabors.snk
diff --git a/Directory.Build.targets b/Directory.Build.targets
index 4e7ab9e6b..2a7d25b97 100644
--- a/Directory.Build.targets
+++ b/Directory.Build.targets
@@ -18,22 +18,18 @@
-
+
-
+
-
-
+
diff --git a/ImageSharp.sln b/ImageSharp.sln
index 3ebc9453f..a8e0fd330 100644
--- a/ImageSharp.sln
+++ b/ImageSharp.sln
@@ -8,6 +8,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
.gitattributes = .gitattributes
.gitignore = .gitignore
.gitmodules = .gitmodules
+ .runsettings = .runsettings
ci-build.ps1 = ci-build.ps1
ci-pack.ps1 = ci-pack.ps1
ci-test.ps1 = ci-test.ps1
diff --git a/src/Directory.Build.targets b/src/Directory.Build.targets
index d1875262d..9b8be05b5 100644
--- a/src/Directory.Build.targets
+++ b/src/Directory.Build.targets
@@ -21,16 +21,25 @@
-
+
+ $([System.IO.Path]::Combine('$(IntermediateOutputPath)','$(TargetFrameworkMoniker).AssemblyAttributes$(DefaultLanguageSourceExtension)'))
+
+
+
+
+
+
+
+
+ DependsOnTargets="InitializeSourceRootMappedPaths"
+ Returns="@(_LocalTopLevelSourceRoot)"
+ Condition="'$(DeterministicSourcePaths)' == 'true'">
<_LocalTopLevelSourceRoot Include="@(SourceRoot)" Condition="'%(SourceRoot.NestedRoot)' == ''"/>
-
+
false
@@ -62,7 +71,7 @@
-
+
@@ -74,7 +83,7 @@
SkipUnchangedFiles = "true"
DestinationFolder="..\..\" />
-
+
-
+
diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs
index 977432f8b..d24230fe1 100644
--- a/src/ImageSharp/Common/Helpers/ImageMaths.cs
+++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs
@@ -132,6 +132,12 @@ namespace SixLabors.ImageSharp
return (a / GreatestCommonDivisor(a, b)) * b;
}
+ /// <summary>
+ /// Calculates <paramref name="x"/> % 2 with a bit mask; the result is always 0 or 1, so a negative odd value yields 1 (the % operator would yield -1).
+ /// </summary>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static int Modulo2(int x) => x & 1;
+
///
/// Calculates % 4
///
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs
new file mode 100644
index 000000000..7687a5b95
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs
@@ -0,0 +1,193 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Buffers.Binary;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// The JIT can detect and optimize rotation idioms ROTL (Rotate Left)
+// and ROTR (Rotate Right) emitting efficient CPU instructions:
+// https://github.com/dotnet/coreclr/pull/1830
+namespace SixLabors.ImageSharp
+{
+ /// <summary>
+ /// Defines the contract for methods that allow the shuffling of pixel components.
+ /// Used for shuffling on platforms that do not support Hardware Intrinsics.
+ /// </summary>
+ internal interface IComponentShuffle
+ {
+ /// <summary>
+ /// Gets the shuffle control.
+ /// </summary>
+ byte Control { get; }
+
+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ void RunFallbackShuffle(ReadOnlySpan source, Span dest);
+ }
+
+ /// <inheritdoc/>
+ internal interface IShuffle4 : IComponentShuffle
+ {
+ }
+
+ internal readonly struct DefaultShuffle4 : IShuffle4
+ {
+ private readonly byte p3; // p0..p3: index (0-3) of the source component copied to destination component 0..3 of each 4-byte group.
+ private readonly byte p2;
+ private readonly byte p1;
+ private readonly byte p0;
+
+ public DefaultShuffle4(byte p3, byte p2, byte p1, byte p0)
+ {
+ DebugGuard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3));
+ DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2));
+ DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1));
+ DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0));
+
+ this.p3 = p3;
+ this.p2 = p2;
+ this.p1 = p1;
+ this.p0 = p0;
+ this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0); // Combine the four indices into the single control byte.
+ }
+
+ public byte Control { get; }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+ int p3 = this.p3;
+ int p2 = this.p2;
+ int p1 = this.p1;
+ int p0 = this.p0;
+
+ for (int i = 0; i < source.Length; i += 4) // One 4-byte group per iteration; no bounds checks, source.Length drives the loop.
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i);
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+ Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
+ }
+ }
+ }
+
+ internal readonly struct WXYZShuffle4 : IShuffle4
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ int n = source.Length / 4; // Number of whole 32-bit elements; each one is shuffled with a single rotate.
+
+ for (int i = 0; i < n; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte first)
+ // ROTL(8, packed) = [Z Y X W]
+ Unsafe.Add(ref dBase, i) = (packed << 8) | (packed >> 24);
+ }
+ }
+ }
+
+ internal readonly struct WZYXShuffle4 : IShuffle4
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ int n = source.Length / 4; // Number of whole 32-bit elements.
+
+ for (int i = 0; i < n; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte first)
+ // REVERSE(packed) = [X Y Z W]
+ Unsafe.Add(ref dBase, i) = BinaryPrimitives.ReverseEndianness(packed);
+ }
+ }
+ }
+
+ internal readonly struct YZWXShuffle4 : IShuffle4
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ int n = source.Length / 4; // Number of whole 32-bit elements.
+
+ for (int i = 0; i < n; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte first)
+ // ROTR(8, packed) = [X W Z Y]
+ Unsafe.Add(ref dBase, i) = (packed >> 8) | (packed << 24);
+ }
+ }
+ }
+
+ internal readonly struct ZYXWShuffle4 : IShuffle4
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ int n = source.Length / 4; // Number of whole 32-bit elements.
+
+ for (int i = 0; i < n; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte first)
+ // tmp1 = [W 0 Y 0]
+ // tmp2 = [0 Z 0 X]
+ // tmp3=ROTL(16, tmp2) = [0 X 0 Z]
+ // tmp1 + tmp3 = [W X Y Z]
+ uint tmp1 = packed & 0xFF00FF00;
+ uint tmp2 = packed & 0x00FF00FF;
+ uint tmp3 = (tmp2 << 16) | (tmp2 >> 16);
+
+ Unsafe.Add(ref dBase, i) = tmp1 + tmp3; // The two operands occupy disjoint bytes, so the addition cannot carry.
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
new file mode 100644
index 000000000..0c2b1d508
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
@@ -0,0 +1,103 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ /// <inheritdoc/>
+ internal interface IPad3Shuffle4 : IComponentShuffle
+ {
+ }
+
+ internal readonly struct DefaultPad3Shuffle4 : IPad3Shuffle4
+ {
+ private readonly byte p3; // p0..p3: index (0-3) into the padded 4-byte group that is written to destination component 0..3.
+ private readonly byte p2;
+ private readonly byte p1;
+ private readonly byte p0;
+
+ public DefaultPad3Shuffle4(byte p3, byte p2, byte p1, byte p0)
+ {
+ DebugGuard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3));
+ DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2));
+ DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1));
+ DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0));
+
+ this.p3 = p3;
+ this.p2 = p2;
+ this.p1 = p1;
+ this.p0 = p0;
+ this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0);
+ }
+
+ public byte Control { get; }
+ // Expands every 3-byte source group to 4 bytes (4th byte = 0xFF), then writes those 4 bytes to dest reordered by p0..p3.
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+ int p3 = this.p3;
+ int p2 = this.p2;
+ int p1 = this.p1;
+ int p0 = this.p0;
+
+ Span temp = stackalloc byte[4];
+ ref byte t = ref MemoryMarshal.GetReference(temp);
+ Unsafe.Add(ref t, 3) = byte.MaxValue; // Padding byte: set once, the loop only rewrites bytes 0-2.
+
+ for (int i = 0, j = 0; i < source.Length; i += 3, j += 4)
+ {
+ Unsafe.Add(ref t, 0) = Unsafe.Add(ref sBase, i); // Byte-wise copy: a 4-byte load here would read one byte past the end of source on the final triplet.
+ Unsafe.Add(ref t, 1) = Unsafe.Add(ref sBase, i + 1);
+ Unsafe.Add(ref t, 2) = Unsafe.Add(ref sBase, i + 2);
+ Unsafe.Add(ref dBase, j) = Unsafe.Add(ref t, p0);
+ Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref t, p1);
+ Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref t, p2);
+ Unsafe.Add(ref dBase, j + 3) = Unsafe.Add(ref t, p3);
+ }
+ }
+ }
+
+ internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+ ref byte sEnd = ref Unsafe.Add(ref sBase, source.Length);
+ ref byte sLoopEnd = ref Unsafe.Subtract(ref sEnd, 4); // The fast loop reads 4 bytes per 3-byte step, so it must stop before a read could cross sEnd.
+
+ while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd))
+ {
+ Unsafe.As(ref dBase) = Unsafe.As(ref sBase) | 0xFF000000; // 32-bit copy with the top byte forced to 0xFF.
+
+ sBase = ref Unsafe.Add(ref sBase, 3);
+ dBase = ref Unsafe.Add(ref dBase, 4);
+ }
+
+ while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd)) // Whatever is left is copied one byte at a time, touching only the 3 source bytes.
+ {
+ Unsafe.Add(ref dBase, 0) = Unsafe.Add(ref sBase, 0);
+ Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1);
+ Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2);
+ Unsafe.Add(ref dBase, 3) = byte.MaxValue;
+
+ sBase = ref Unsafe.Add(ref sBase, 3);
+ dBase = ref Unsafe.Add(ref dBase, 4);
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
new file mode 100644
index 000000000..61e99890e
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
@@ -0,0 +1,53 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ /// <inheritdoc/>
+ internal interface IShuffle3 : IComponentShuffle
+ {
+ }
+
+ internal readonly struct DefaultShuffle3 : IShuffle3
+ {
+ private readonly byte p2; // p0..p2: index (0-2) of the source component copied to destination component 0..2 of each 3-byte group.
+ private readonly byte p1;
+ private readonly byte p0;
+
+ public DefaultShuffle3(byte p2, byte p1, byte p0)
+ {
+ DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 2, nameof(p2));
+ DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 2, nameof(p1));
+ DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 2, nameof(p0));
+
+ this.p2 = p2;
+ this.p1 = p1;
+ this.p0 = p0;
+ this.Control = SimdUtils.Shuffle.MmShuffle(3, p2, p1, p0); // The 4th control slot is fixed at 3; only three indices are caller-supplied.
+ }
+
+ public byte Control { get; }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+ int p2 = this.p2;
+ int p1 = this.p1;
+ int p0 = this.p0;
+
+ for (int i = 0; i < source.Length; i += 3) // One 3-byte group per iteration; no bounds checks, source.Length drives the loop.
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i);
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
new file mode 100644
index 000000000..86e4174f1
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
@@ -0,0 +1,101 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ /// <inheritdoc/>
+ internal interface IShuffle4Slice3 : IComponentShuffle
+ {
+ }
+
+ internal readonly struct DefaultShuffle4Slice3 : IShuffle4Slice3
+ {
+ private readonly byte p2; // p0..p2: index (0-3) of the source component copied to destination component 0..2.
+ private readonly byte p1;
+ private readonly byte p0;
+
+ public DefaultShuffle4Slice3(byte p3, byte p2, byte p1, byte p0)
+ {
+ DebugGuard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3));
+ DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2));
+ DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1));
+ DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0));
+
+ this.p2 = p2;
+ this.p1 = p1;
+ this.p0 = p0;
+ this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0); // p3 is not stored: it only feeds Control, the fallback never writes a 4th component.
+ }
+
+ public byte Control { get; }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+ int p2 = this.p2;
+ int p1 = this.p1;
+ int p0 = this.p0;
+
+ for (int i = 0, j = 0; i < dest.Length; i += 3, j += 4) // i walks dest in 3-byte steps, j walks source in 4-byte steps; dest.Length drives the loop.
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + j);
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j);
+ }
+ }
+ }
+
+ internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
+ ref Byte3 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+
+ int n = source.Length / 4; // Number of whole 4-byte source elements.
+ int m = ImageMaths.Modulo4(n); // Elements left over after the 4x-unrolled loop.
+ int u = n - m;
+
+ ref uint sLoopEnd = ref Unsafe.Add(ref sBase, u);
+ ref uint sEnd = ref Unsafe.Add(ref sBase, n);
+
+ while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd)) // Unrolled: four elements per iteration, each stored through a 3-byte Byte3 reference.
+ {
+ Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0));
+ Unsafe.Add(ref dBase, 1) = Unsafe.As(ref Unsafe.Add(ref sBase, 1));
+ Unsafe.Add(ref dBase, 2) = Unsafe.As(ref Unsafe.Add(ref sBase, 2));
+ Unsafe.Add(ref dBase, 3) = Unsafe.As(ref Unsafe.Add(ref sBase, 3));
+
+ sBase = ref Unsafe.Add(ref sBase, 4);
+ dBase = ref Unsafe.Add(ref dBase, 4);
+ }
+
+ while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd)) // Tail: the remaining m elements, one at a time.
+ {
+ Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0));
+
+ sBase = ref Unsafe.Add(ref sBase, 1);
+ dBase = ref Unsafe.Add(ref dBase, 1);
+ }
+ }
+ }
+
+ [StructLayout(LayoutKind.Explicit, Size = 3)] // Field-less struct declared with an explicit size of 3 bytes.
+ internal readonly struct Byte3
+ {
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
deleted file mode 100644
index b56c92dab..000000000
--- a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-#if SUPPORTS_RUNTIME_INTRINSICS
-
-using System;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-
-namespace SixLabors.ImageSharp
-{
- internal static partial class SimdUtils
- {
- public static class Avx2Intrinsics
- {
- private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
-
- ///
- /// as many elements as possible, slicing them down (keeping the remainder).
- ///
- [MethodImpl(InliningOptions.ShortMethod)]
- internal static void NormalizedFloatToByteSaturateReduce(
- ref ReadOnlySpan source,
- ref Span dest)
- {
- DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
-
- if (Avx2.IsSupported)
- {
- int remainder = ImageMaths.ModuloP2(source.Length, Vector.Count);
- int adjustedCount = source.Length - remainder;
-
- if (adjustedCount > 0)
- {
- NormalizedFloatToByteSaturate(
- source.Slice(0, adjustedCount),
- dest.Slice(0, adjustedCount));
-
- source = source.Slice(adjustedCount);
- dest = dest.Slice(adjustedCount);
- }
- }
- }
-
- ///
- /// Implementation of , which is faster on new .NET runtime.
- ///
- ///
- /// Implementation is based on MagicScaler code:
- /// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477
- ///
- internal static void NormalizedFloatToByteSaturate(
- ReadOnlySpan source,
- Span dest)
- {
- VerifySpanInput(source, dest, Vector256.Count);
-
- int n = dest.Length / Vector256.Count;
-
- ref Vector256 sourceBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
- ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
-
- var maxBytes = Vector256.Create(255f);
- ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
- Vector256 mask = Unsafe.As>(ref maskBase);
-
- for (int i = 0; i < n; i++)
- {
- ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4);
-
- Vector256 f0 = s;
- Vector256 f1 = Unsafe.Add(ref s, 1);
- Vector256 f2 = Unsafe.Add(ref s, 2);
- Vector256 f3 = Unsafe.Add(ref s, 3);
-
- Vector256 w0 = ConvertToInt32(f0, maxBytes);
- Vector256 w1 = ConvertToInt32(f1, maxBytes);
- Vector256 w2 = ConvertToInt32(f2, maxBytes);
- Vector256 w3 = ConvertToInt32(f3, maxBytes);
-
- Vector256 u0 = Avx2.PackSignedSaturate(w0, w1);
- Vector256 u1 = Avx2.PackSignedSaturate(w2, w3);
- Vector256 b = Avx2.PackUnsignedSaturate(u0, u1);
- b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();
-
- Unsafe.Add(ref destBase, i) = b;
- }
- }
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static Vector256 ConvertToInt32(Vector256 vf, Vector256 scale)
- {
- vf = Avx.Multiply(vf, scale);
- return Avx.ConvertToVector256Int32(vf);
- }
- }
- }
-}
-#endif
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
new file mode 100644
index 000000000..2ea7f2c9b
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -0,0 +1,795 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp
+{
+ internal static partial class SimdUtils
+ {
+ public static class HwIntrinsics
+ {
+ public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; // Read as little-endian 32-bit values: 0, 4, 1, 5, 2, 6, 3, 7.
+
+ public static ReadOnlySpan PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; // Read as little-endian 32-bit values: 0, 2, 4, 6, 1, 3, 5, 7.
+
+ private static ReadOnlySpan ShuffleMaskPad4Nx16 => new byte[] { 0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80 }; // Byte indices spreading four 3-byte groups over 4 bytes each; 0x80 marks the inserted 4th byte.
+
+ private static ReadOnlySpan ShuffleMaskSlice4Nx16 => new byte[] { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80 }; // Byte indices keeping bytes 0-2 of each 4-byte group (12 bytes); the last four lanes are 0x80.
+
+ /// <summary>
+ /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>. Both spans are then sliced to the unprocessed remainder.
+ /// </summary>
+ /// <param name="source">The source span of floats.</param>
+ /// <param name="dest">The destination span of floats.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Reduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Avx.IsSupported || Sse.IsSupported)
+ {
+ int remainder = Avx.IsSupported
+ ? ImageMaths.ModuloP2(source.Length, Vector256.Count)
+ : ImageMaths.ModuloP2(source.Length, Vector128.Count);
+
+ int adjustedCount = source.Length - remainder; // Largest prefix that is a whole number of vectors.
+
+ if (adjustedCount > 0)
+ {
+ Shuffle4(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount),
+ control);
+
+ source = source.Slice(adjustedCount); // Hand the unprocessed tail back to the caller through the ref parameters.
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>. Both spans are then sliced to the unprocessed remainder.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Reduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Avx2.IsSupported || Ssse3.IsSupported)
+ {
+ int remainder = Avx2.IsSupported
+ ? ImageMaths.ModuloP2(source.Length, Vector256.Count)
+ : ImageMaths.ModuloP2(source.Length, Vector128.Count);
+
+ int adjustedCount = source.Length - remainder; // Largest prefix that is a whole number of vectors.
+
+ if (adjustedCount > 0)
+ {
+ Shuffle4(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount),
+ control);
+
+ source = source.Slice(adjustedCount); // Hand the unprocessed tail back to the caller through the ref parameters.
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Shuffles 8-bit integer triplets within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>. Both spans are then sliced to the unprocessed remainder.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle3Reduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ int remainder = source.Length % (Vector128.Count * 3); // Work is done in groups of three vectors.
+
+ int adjustedCount = source.Length - remainder;
+
+ if (adjustedCount > 0)
+ {
+ Shuffle3(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount),
+ control);
+
+ source = source.Slice(adjustedCount); // Hand the unprocessed tail back to the caller through the ref parameters.
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Pads then shuffles 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>. Both spans are then sliced to the unprocessed remainder.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Pad3Shuffle4Reduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ int remainder = source.Length % (Vector128.Count * 3); // Source is consumed in groups of three vectors.
+
+ int sourceCount = source.Length - remainder;
+ int destCount = sourceCount * 4 / 3; // 4 destination bytes for every 3 source bytes.
+
+ if (sourceCount > 0)
+ {
+ Pad3Shuffle4(
+ source.Slice(0, sourceCount),
+ dest.Slice(0, destCount),
+ control);
+
+ source = source.Slice(sourceCount); // Hand the unprocessed tail back to the caller through the ref parameters.
+ dest = dest.Slice(destCount);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Shuffles then slices 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>. Both spans are then sliced to the unprocessed remainder.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Slice3Reduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ int remainder = source.Length % (Vector128.Count * 4); // Source is consumed in groups of four vectors.
+
+ int sourceCount = source.Length - remainder;
+ int destCount = sourceCount * 3 / 4; // 3 destination bytes for every 4 source bytes.
+
+ if (sourceCount > 0)
+ {
+ Shuffle4Slice3(
+ source.Slice(0, sourceCount),
+ dest.Slice(0, destCount),
+ control);
+
+ source = source.Slice(sourceCount); // Hand the unprocessed tail back to the caller through the ref parameters.
+ dest = dest.Slice(destCount);
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Avx.IsSupported)
+ {
+ ref Vector256 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector256.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x-unrolled loop.
+ int u = n - m;
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i);
+ ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
+
+ vd0 = Avx.Permute(vs0, control);
+ Unsafe.Add(ref vd0, 1) = Avx.Permute(Unsafe.Add(ref vs0, 1), control);
+ Unsafe.Add(ref vd0, 2) = Avx.Permute(Unsafe.Add(ref vs0, 2), control);
+ Unsafe.Add(ref vd0, 3) = Avx.Permute(Unsafe.Add(ref vs0, 3), control);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Tail: remaining vectors, one at a time.
+ {
+ Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control);
+ }
+ }
+ }
+ else
+ {
+ // Sse path: each vector is shuffled with itself passed as both operands.
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector128.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x-unrolled loop.
+ int u = n - m;
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i);
+ ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
+
+ vd0 = Sse.Shuffle(vs0, vs0, control);
+
+ Vector128 vs1 = Unsafe.Add(ref vs0, 1);
+ Unsafe.Add(ref vd0, 1) = Sse.Shuffle(vs1, vs1, control);
+
+ Vector128 vs2 = Unsafe.Add(ref vs0, 2);
+ Unsafe.Add(ref vd0, 2) = Sse.Shuffle(vs2, vs2, control);
+
+ Vector128 vs3 = Unsafe.Add(ref vs0, 3);
+ Unsafe.Add(ref vd0, 3) = Sse.Shuffle(vs3, vs3, control);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Tail: remaining vectors, one at a time.
+ {
+ Vector128 vs = Unsafe.Add(ref sourceBase, i);
+ Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control);
+ }
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Avx2.IsSupported)
+ {
+ // The shuffle mask is built on the stack on every call, for convenience while we
+ // determine which shuffle controls to add to the library.
+ // Static ReadOnlySpan mask instances can be added in the future if need be.
+ Span bytes = stackalloc byte[Vector256.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control);
+ Vector256 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector256 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector256.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x-unrolled loop.
+ int u = n - m;
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i);
+
+ vd0 = Avx2.Shuffle(vs0, vshuffle);
+ Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle);
+ Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle);
+ Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Tail: remaining vectors, one at a time.
+ {
+ Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle);
+ }
+ }
+ }
+ else
+ {
+ // Ssse3 path: same structure as above with 128-bit vectors.
+ Span bytes = stackalloc byte[Vector128.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control);
+ Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector128.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x-unrolled loop.
+ int u = n - m;
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i);
+
+ vd0 = Ssse3.Shuffle(vs0, vshuffle);
+ Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle);
+ Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle);
+ Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Tail: remaining vectors, one at a time.
+ {
+ Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle);
+ }
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle3(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16);
+ Vector128 vmask = Unsafe.As>(ref vmaskBase);
+ ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16);
+ Vector128 vmasko = Unsafe.As>(ref vmaskoBase);
+ Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12); // vmasko rotated by 4 lanes: the 12 kept bytes land in lanes 4-15 instead of 0-11.
+
+ Span bytes = stackalloc byte[Vector128.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control);
+ Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = source.Length / Vector128.Count;
+
+ for (int i = 0; i < n; i += 3) // Each iteration consumes three vectors (48 bytes) and produces three.
+ {
+ ref Vector128 vs = ref Unsafe.Add(ref sourceBase, i);
+
+ Vector128 v0 = vs;
+ Vector128 v1 = Unsafe.Add(ref vs, 1);
+ Vector128 v2 = Unsafe.Add(ref vs, 2);
+ Vector128 v3 = Sse2.ShiftRightLogical128BitLane(v2, 4); // v3 starts at byte 36 of the 48-byte group.
+
+ v2 = Ssse3.AlignRight(v2, v1, 8); // v2 now starts at byte 24.
+ v1 = Ssse3.AlignRight(v1, v0, 12); // v1 now starts at byte 12, so v0..v3 each begin with their own 12 bytes.
+
+ v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vmask), vshuffle); // Pad 3 -> 4 bytes per group, then apply the requested shuffle.
+ v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vmask), vshuffle);
+ v2 = Ssse3.Shuffle(Ssse3.Shuffle(v2, vmask), vshuffle);
+ v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vmask), vshuffle);
+
+ v0 = Ssse3.Shuffle(v0, vmaske); // Drop the 4th byte of every group again: 12 useful bytes per vector.
+ v1 = Ssse3.Shuffle(v1, vmasko);
+ v2 = Ssse3.Shuffle(v2, vmaske);
+ v3 = Ssse3.Shuffle(v3, vmasko);
+
+ v0 = Ssse3.AlignRight(v1, v0, 4); // Stitch the four 12-byte results back into three full 16-byte vectors.
+ v3 = Ssse3.AlignRight(v3, v2, 12);
+
+ v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4);
+ v2 = Sse2.ShiftRightLogical128BitLane(v2, 4);
+
+ v1 = Ssse3.AlignRight(v2, v1, 8);
+
+ ref Vector128 vd = ref Unsafe.Add(ref destBase, i);
+
+ vd = v0;
+ Unsafe.Add(ref vd, 1) = v1;
+ Unsafe.Add(ref vd, 2) = v3;
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Pad3Shuffle4(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16);
+ Vector128 vmask = Unsafe.As>(ref vmaskBase);
+ Vector128 vfill = Vector128.Create(0xff000000ff000000ul).AsByte(); // 0xFF in every 4th byte, OR-ed into the padded position below.
+
+ Span bytes = stackalloc byte[Vector128.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control);
+ Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = source.Length / Vector128.Count;
+
+ for (int i = 0, j = 0; i < n; i += 3, j += 4) // Each iteration consumes three source vectors and produces four destination vectors.
+ {
+ ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i);
+ Vector128 v1 = Unsafe.Add(ref v0, 1);
+ Vector128 v2 = Unsafe.Add(ref v0, 2);
+ Vector128 v3 = Sse2.ShiftRightLogical128BitLane(v2, 4); // v3 starts at byte 36 of the 48-byte group.
+
+ v2 = Ssse3.AlignRight(v2, v1, 8); // v2 now starts at byte 24.
+ v1 = Ssse3.AlignRight(v1, v0, 12); // v1 now starts at byte 12, so v0..v3 each begin with their own 12 bytes.
+
+ ref Vector128 vd = ref Unsafe.Add(ref destBase, j);
+
+ vd = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v0, vmask), vfill), vshuffle); // Pad 3 -> 4 bytes, fill the pad with 0xFF, then apply the requested shuffle.
+ Unsafe.Add(ref vd, 1) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v1, vmask), vfill), vshuffle);
+ Unsafe.Add(ref vd, 2) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v2, vmask), vfill), vshuffle);
+ Unsafe.Add(ref vd, 3) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v3, vmask), vfill), vshuffle);
+ }
+ }
+ }
+ /// <summary>Reads source 4 vectors (64 bytes) at a time, shuffles each vector according to control and then with a packing mask, and stitches 12 bytes from each into 3 dest vectors (48 bytes). Does nothing unless SSSE3 is supported.</summary>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4Slice3(
+ ReadOnlySpan source, // input, consumed 4 vectors per iteration
+ Span dest, // output, written 3 vectors per iteration
+ byte control) // shuffle control handed to Shuffle.MmShuffleSpan
+ {
+ if (Ssse3.IsSupported) // no fallback in this method: without SSSE3 nothing is written
+ {
+ ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16);
+ Vector128 vmasko = Unsafe.As>(ref vmaskoBase); // table is declared elsewhere; applied to v1/v3, whose low 12 bytes are kept below
+ Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12); // vmasko rotated up by 4 byte positions; applied to v0/v2, whose high 12 bytes are kept below
+ // Build the per-vector byte shuffle from control (MmShuffleSpan is defined elsewhere).
+ Span bytes = stackalloc byte[Vector128.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control);
+ Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+ // Number of whole vectors available in source.
+ int n = source.Length / Vector128.Count;
+ // NOTE(review): each iteration reads 4 source vectors and writes 3 dest vectors with no bounds check here; presumably the caller guarantees suitable lengths - confirm.
+ for (int i = 0, j = 0; i < n; i += 4, j += 3)
+ {
+ ref Vector128 vs = ref Unsafe.Add(ref sourceBase, i);
+
+ Vector128 v0 = vs;
+ Vector128 v1 = Unsafe.Add(ref vs, 1);
+ Vector128 v2 = Unsafe.Add(ref vs, 2);
+ Vector128 v3 = Unsafe.Add(ref vs, 3);
+ // Apply the control shuffle first, then the packing mask (alternating vmaske / vmasko).
+ v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vshuffle), vmaske);
+ v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vshuffle), vmasko);
+ v2 = Ssse3.Shuffle(Ssse3.Shuffle(v2, vshuffle), vmaske);
+ v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vshuffle), vmasko);
+ // Stitch the kept 12-byte runs together; v3 is built before v2 is shifted because it needs v2's top 4 bytes.
+ v0 = Ssse3.AlignRight(v1, v0, 4); // first output = v0[4..15] followed by v1[0..3]
+ v3 = Ssse3.AlignRight(v3, v2, 12); // third output = v2[12..15] followed by v3[0..11]
+
+ v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4); // v1[4..11] now sits in bytes 8..15
+ v2 = Sse2.ShiftRightLogical128BitLane(v2, 4); // v2[4..11] now sits in bytes 0..7
+
+ v1 = Ssse3.AlignRight(v2, v1, 8); // second output = v1[4..11] followed by v2[4..11]
+
+ ref Vector128 vd = ref Unsafe.Add(ref destBase, j);
+
+ vd = v0;
+ Unsafe.Add(ref vd, 1) = v1;
+ Unsafe.Add(ref vd, 2) = v3;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Computes <c>(vm0 * vm1) + va</c> element-wise.
+ /// </summary>
+ /// <remarks>
+ /// Uses the fused multiply-add instruction when FMA is supported and falls back to a
+ /// separate AVX multiply followed by an add otherwise.
+ /// </remarks>
+ /// <param name="va">The vector to add to the intermediate result.</param>
+ /// <param name="vm0">The first vector to multiply.</param>
+ /// <param name="vm1">The second vector to multiply.</param>
+ /// <returns>The vector holding the result of the multiplication and addition.</returns>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static Vector256 MultiplyAdd(
+     in Vector256 va,
+     in Vector256 vm0,
+     in Vector256 vm1)
+ {
+     if (!Fma.IsSupported)
+     {
+         // No FMA: multiply first, then add the accumulator.
+         return Avx.Add(Avx.Multiply(vm0, vm1), va);
+     }
+
+     return Fma.MultiplyAdd(vm1, vm0, va);
+ }
+
+ /// <summary>
+ /// Runs <see cref="ByteToNormalizedFloat"/> on as many leading elements as possible, then slices
+ /// both spans down so that only the unconverted remainder is left in them.
+ /// </summary>
+ /// <remarks>
+ /// The converted length is the source length minus its remainder modulo the vector element count
+ /// (256-bit when AVX2 is supported, 128-bit otherwise). When neither AVX2 nor SSE2 is supported,
+ /// or when there is less than one full vector of input, both spans are left untouched.
+ /// </remarks>
+ /// <param name="source">The source span; on return it holds only the elements that were not converted.</param>
+ /// <param name="dest">The destination span; on return it is sliced by the same amount as <paramref name="source"/>.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ internal static void ByteToNormalizedFloatReduce(
+     ref ReadOnlySpan source,
+     ref Span dest)
+ {
+     DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+     if (!Avx2.IsSupported && !Sse2.IsSupported)
+     {
+         return;
+     }
+
+     // Pick the widest vector the hardware offers and keep whatever does not fill one.
+     int elementsPerVector = Avx2.IsSupported ? Vector256.Count : Vector128.Count;
+     int tail = ImageMaths.ModuloP2(source.Length, elementsPerVector);
+     int bulk = source.Length - tail;
+
+     if (bulk <= 0)
+     {
+         return;
+     }
+
+     ByteToNormalizedFloat(source.Slice(0, bulk), dest.Slice(0, bulk));
+
+     // Advance both spans past the converted part so the caller only sees the remainder.
+     source = source.Slice(bulk);
+     dest = dest.Slice(bulk);
+ }
+
+ /// <summary>
+ /// Hardware-intrinsics implementation of the byte to normalized float conversion, which is faster on new RyuJIT runtime.
+ /// </summary>
+ /// <remarks>
+ /// Implementation is based on MagicScaler code:
+ /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
+ /// </remarks>
+ internal static unsafe void ByteToNormalizedFloat(
+ ReadOnlySpan source,
+ Span dest)
+ {
+ if (Avx2.IsSupported)
+ {
+ VerifySpanInput(source, dest, Vector256.Count);
+
+ int n = dest.Length / Vector256.Count;
+
+ byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ var scale = Vector256.Create(1 / (float)byte.MaxValue);
+
+ for (int i = 0; i < n; i++)
+ {
+ int si = Vector256.Count * i;
+ Vector256 i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
+ Vector256 i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256.Count);
+ Vector256 i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256.Count * 2));
+ Vector256 i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256.Count * 3));
+
+ Vector256 f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
+ Vector256 f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
+ Vector256 f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
+ Vector256 f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));
+
+ ref Vector256 d = ref Unsafe.Add(ref destBase, i * 4);
+
+ d = f0;
+ Unsafe.Add(ref d, 1) = f1;
+ Unsafe.Add(ref d, 2) = f2;
+ Unsafe.Add(ref d, 3) = f3;
+ }
+ }
+ else
+ {
+ // Sse
+ VerifySpanInput(source, dest, Vector128.Count);
+
+ int n = dest.Length / Vector128.Count;
+
+ byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ var scale = Vector128.Create(1 / (float)byte.MaxValue);
+ Vector128 zero = Vector128.Zero;
+
+ for (int i = 0; i < n; i++)
+ {
+ int si = Vector128.Count * i;
+
+ Vector128 i0, i1, i2, i3;
+ if (Sse41.IsSupported)
+ {
+ i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
+ i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128.Count);
+ i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128.Count * 2));
+ i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128.Count * 3));
+ }
+ else
+ {
+ Vector128