diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs
new file mode 100644
index 0000000000..7687a5b95f
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs
@@ -0,0 +1,193 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Buffers.Binary;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// The JIT can detect and optimize rotation idioms ROTL (Rotate Left)
+// and ROTR (Rotate Right) emitting efficient CPU instructions:
+// https://github.com/dotnet/coreclr/pull/1830
+namespace SixLabors.ImageSharp
+{
+ /// <summary>
+ /// Defines the contract for methods that allow the shuffling of pixel components.
+ /// Used for shuffling on platforms that do not support Hardware Intrinsics.
+ /// </summary>
+ internal interface IComponentShuffle
+ {
+ /// <summary>
+ /// Gets the shuffle control.
+ /// </summary>
+ byte Control { get; }
+
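+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>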
+ void RunFallbackShuffle(ReadOnlySpan source, Span dest);
+ }
+
+ /// <inheritdoc/>
+ internal interface IShuffle4 : IComponentShuffle
+ {
+ }
+
+ internal readonly struct DefaultShuffle4 : IShuffle4
+ {
+ private readonly byte p3;
+ private readonly byte p2;
+ private readonly byte p1;
+ private readonly byte p0;
+
+ public DefaultShuffle4(byte p3, byte p2, byte p1, byte p0)
+ {
+ DebugGuard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3));
+ DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2));
+ DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1));
+ DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0));
+
+ this.p3 = p3;
+ this.p2 = p2;
+ this.p1 = p1;
+ this.p0 = p0;
+ this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0);
+ }
+
+ public byte Control { get; }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+ int p3 = this.p3;
+ int p2 = this.p2;
+ int p1 = this.p1;
+ int p0 = this.p0;
+
+ for (int i = 0; i < source.Length; i += 4)
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i);
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+ Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
+ }
+ }
+ }
+
+ internal readonly struct WXYZShuffle4 : IShuffle4
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ int n = source.Length / 4;
+
+ for (int i = 0; i < n; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X]
+ // ROTL(8, packed) = [Z Y X W]
+ Unsafe.Add(ref dBase, i) = (packed << 8) | (packed >> 24);
+ }
+ }
+ }
+
+ internal readonly struct WZYXShuffle4 : IShuffle4
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ int n = source.Length / 4;
+
+ for (int i = 0; i < n; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X]
+ // REVERSE(packed) = [X Y Z W]
+ Unsafe.Add(ref dBase, i) = BinaryPrimitives.ReverseEndianness(packed);
+ }
+ }
+ }
+
+ internal readonly struct YZWXShuffle4 : IShuffle4
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ int n = source.Length / 4;
+
+ for (int i = 0; i < n; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X]
+ // ROTR(8, packed) = [X W Z Y]
+ Unsafe.Add(ref dBase, i) = (packed >> 8) | (packed << 24);
+ }
+ }
+ }
+
+ internal readonly struct ZYXWShuffle4 : IShuffle4
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ int n = source.Length / 4;
+
+ for (int i = 0; i < n; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X]
+ // tmp1 = [W 0 Y 0]
+ // tmp2 = [0 Z 0 X]
+ // tmp3=ROTL(16, tmp2) = [0 X 0 Z]
+ // tmp1 + tmp3 = [W X Y Z]
+ uint tmp1 = packed & 0xFF00FF00;
+ uint tmp2 = packed & 0x00FF00FF;
+ uint tmp3 = (tmp2 << 16) | (tmp2 >> 16);
+
+ Unsafe.Add(ref dBase, i) = tmp1 + tmp3;
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
new file mode 100644
index 0000000000..0c2b1d5082
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
@@ -0,0 +1,103 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ /// <inheritdoc/>
+ internal interface IPad3Shuffle4 : IComponentShuffle
+ {
+ }
+
+ internal readonly struct DefaultPad3Shuffle4 : IPad3Shuffle4
+ {
+ private readonly byte p3;
+ private readonly byte p2;
+ private readonly byte p1;
+ private readonly byte p0;
+
+ public DefaultPad3Shuffle4(byte p3, byte p2, byte p1, byte p0)
+ {
+ DebugGuard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3));
+ DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2));
+ DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1));
+ DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0));
+
+ this.p3 = p3;
+ this.p2 = p2;
+ this.p1 = p1;
+ this.p0 = p0;
+ this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0);
+ }
+
+ public byte Control { get; }
+
+ /// <summary>
+ /// Pads each 3-byte group of <paramref name="source"/> with a fourth byte of 0xFF,
+ /// then writes the four bytes to <paramref name="dest"/> in the order given by p0..p3.
+ /// </summary>
+ /// <param name="source">The source span of bytes, consumed three bytes per iteration.</param>
+ /// <param name="dest">The destination span of bytes, filled four bytes per iteration.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+ int p3 = this.p3;
+ int p2 = this.p2;
+ int p1 = this.p1;
+ int p0 = this.p0;
+
+ // Scratch group laid out as [s0, s1, s2, 0xFF]; the pad byte is constant, so set it once.
+ Span<byte> temp = stackalloc byte[4];
+ ref byte t = ref MemoryMarshal.GetReference(temp);
+ Unsafe.Add(ref t, 3) = byte.MaxValue;
+
+ for (int i = 0, j = 0; i < source.Length; i += 3, j += 4)
+ {
+ // Copy exactly three bytes. A single 32-bit load from here would touch
+ // source[i + 3], which lies past the end of the span on the last group,
+ // and would only place the pad byte correctly on little-endian targets.
+ ref byte s = ref Unsafe.Add(ref sBase, i);
+ t = s;
+ Unsafe.Add(ref t, 1) = Unsafe.Add(ref s, 1);
+ Unsafe.Add(ref t, 2) = Unsafe.Add(ref s, 2);
+
+ // p0..p3 index into the scratch group; index 3 selects the pad byte.
+ Unsafe.Add(ref dBase, j) = Unsafe.Add(ref t, p0);
+ Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref t, p1);
+ Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref t, p2);
+ Unsafe.Add(ref dBase, j + 3) = Unsafe.Add(ref t, p3);
+ }
+ }
+ }
+
+ internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+ ref byte sEnd = ref Unsafe.Add(ref sBase, source.Length);
+ ref byte sLoopEnd = ref Unsafe.Subtract(ref sEnd, 4);
+
+ while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd))
+ {
+ Unsafe.As(ref dBase) = Unsafe.As(ref sBase) | 0xFF000000;
+
+ sBase = ref Unsafe.Add(ref sBase, 3);
+ dBase = ref Unsafe.Add(ref dBase, 4);
+ }
+
+ while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd))
+ {
+ Unsafe.Add(ref dBase, 0) = Unsafe.Add(ref sBase, 0);
+ Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1);
+ Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2);
+ Unsafe.Add(ref dBase, 3) = byte.MaxValue;
+
+ sBase = ref Unsafe.Add(ref sBase, 3);
+ dBase = ref Unsafe.Add(ref dBase, 4);
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
new file mode 100644
index 0000000000..61e99890e7
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
@@ -0,0 +1,53 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ /// <inheritdoc/>
+ internal interface IShuffle3 : IComponentShuffle
+ {
+ }
+
+ internal readonly struct DefaultShuffle3 : IShuffle3
+ {
+ private readonly byte p2;
+ private readonly byte p1;
+ private readonly byte p0;
+
+ public DefaultShuffle3(byte p2, byte p1, byte p0)
+ {
+ DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 2, nameof(p2));
+ DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 2, nameof(p1));
+ DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 2, nameof(p0));
+
+ this.p2 = p2;
+ this.p1 = p1;
+ this.p0 = p0;
+ this.Control = SimdUtils.Shuffle.MmShuffle(3, p2, p1, p0);
+ }
+
+ public byte Control { get; }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+ int p2 = this.p2;
+ int p1 = this.p1;
+ int p0 = this.p0;
+
+ for (int i = 0; i < source.Length; i += 3)
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i);
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
new file mode 100644
index 0000000000..86e4174f11
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
@@ -0,0 +1,101 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ /// <inheritdoc/>
+ internal interface IShuffle4Slice3 : IComponentShuffle
+ {
+ }
+
+ internal readonly struct DefaultShuffle4Slice3 : IShuffle4Slice3
+ {
+ private readonly byte p2;
+ private readonly byte p1;
+ private readonly byte p0;
+
+ public DefaultShuffle4Slice3(byte p3, byte p2, byte p1, byte p0)
+ {
+ DebugGuard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3));
+ DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2));
+ DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1));
+ DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0));
+
+ this.p2 = p2;
+ this.p1 = p1;
+ this.p0 = p0;
+ this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0);
+ }
+
+ public byte Control { get; }
+
+ /// <summary>
+ /// Reads <paramref name="source"/> four bytes at a time and writes three bytes per group
+ /// to <paramref name="dest"/>, selecting the source bytes at offsets p0, p1 and p2.
+ /// </summary>
+ /// <param name="source">The source span of bytes, consumed four bytes per iteration.</param>
+ /// <param name="dest">The destination span of bytes, filled three bytes per iteration.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan<byte> source, Span<byte> dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+ int p2 = this.p2;
+ int p1 = this.p1;
+ int p0 = this.p0;
+
+ // Bound the loop by the source, not the destination: a destination longer than
+ // 3/4 of the source would otherwise drive reads past the end of source.
+ for (int i = 0, j = 0; j < source.Length; i += 3, j += 4)
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + j);
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j);
+ }
+ }
+ }
+
+ internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3
+ {
+ public byte Control
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ get => SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0);
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
+ ref Byte3 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+
+ int n = source.Length / 4;
+ int m = ImageMaths.Modulo4(n);
+ int u = n - m;
+
+ ref uint sLoopEnd = ref Unsafe.Add(ref sBase, u);
+ ref uint sEnd = ref Unsafe.Add(ref sBase, n);
+
+ while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd))
+ {
+ Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0));
+ Unsafe.Add(ref dBase, 1) = Unsafe.As(ref Unsafe.Add(ref sBase, 1));
+ Unsafe.Add(ref dBase, 2) = Unsafe.As(ref Unsafe.Add(ref sBase, 2));
+ Unsafe.Add(ref dBase, 3) = Unsafe.As(ref Unsafe.Add(ref sBase, 3));
+
+ sBase = ref Unsafe.Add(ref sBase, 4);
+ dBase = ref Unsafe.Add(ref dBase, 4);
+ }
+
+ while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd))
+ {
+ Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0));
+
+ sBase = ref Unsafe.Add(ref sBase, 1);
+ dBase = ref Unsafe.Add(ref dBase, 1);
+ }
+ }
+ }
+
+ [StructLayout(LayoutKind.Explicit, Size = 3)]
+ internal readonly struct Byte3
+ {
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 2d788992ee..2ea7f2c9bd 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -18,6 +18,500 @@ namespace SixLabors.ImageSharp
public static ReadOnlySpan PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 };
+ private static ReadOnlySpan ShuffleMaskPad4Nx16 => new byte[] { 0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80 };
+
+ private static ReadOnlySpan ShuffleMaskSlice4Nx16 => new byte[] { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80 };
+
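+ /// <summary>
+ /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of floats.</param>
+ /// <param name="dest">The destination span of floats.</param>
+ /// <param name="control">The byte control.</param>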
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Reduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Avx.IsSupported || Sse.IsSupported)
+ {
+ int remainder = Avx.IsSupported
+ ? ImageMaths.ModuloP2(source.Length, Vector256.Count)
+ : ImageMaths.ModuloP2(source.Length, Vector128.Count);
+
+ int adjustedCount = source.Length - remainder;
+
+ if (adjustedCount > 0)
+ {
+ Shuffle4(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount),
+ control);
+
+ source = source.Slice(adjustedCount);
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
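+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>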
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Reduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Avx2.IsSupported || Ssse3.IsSupported)
+ {
+ int remainder = Avx2.IsSupported
+ ? ImageMaths.ModuloP2(source.Length, Vector256.Count)
+ : ImageMaths.ModuloP2(source.Length, Vector128.Count);
+
+ int adjustedCount = source.Length - remainder;
+
+ if (adjustedCount > 0)
+ {
+ Shuffle4(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount),
+ control);
+
+ source = source.Slice(adjustedCount);
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
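+ /// <summary>
+ /// Shuffles 8-bit integer triplets within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>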
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle3Reduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ int remainder = source.Length % (Vector128.Count * 3);
+
+ int adjustedCount = source.Length - remainder;
+
+ if (adjustedCount > 0)
+ {
+ Shuffle3(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount),
+ control);
+
+ source = source.Slice(adjustedCount);
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
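+ /// <summary>
+ /// Pads then shuffles 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>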
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Pad3Shuffle4Reduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ int remainder = source.Length % (Vector128.Count * 3);
+
+ int sourceCount = source.Length - remainder;
+ int destCount = sourceCount * 4 / 3;
+
+ if (sourceCount > 0)
+ {
+ Pad3Shuffle4(
+ source.Slice(0, sourceCount),
+ dest.Slice(0, destCount),
+ control);
+
+ source = source.Slice(sourceCount);
+ dest = dest.Slice(destCount);
+ }
+ }
+ }
+
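+ /// <summary>
+ /// Shuffles then slices 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>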
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Slice3Reduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ int remainder = source.Length % (Vector128.Count * 4);
+
+ int sourceCount = source.Length - remainder;
+ int destCount = sourceCount * 3 / 4;
+
+ if (sourceCount > 0)
+ {
+ Shuffle4Slice3(
+ source.Slice(0, sourceCount),
+ dest.Slice(0, destCount),
+ control);
+
+ source = source.Slice(sourceCount);
+ dest = dest.Slice(destCount);
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Avx.IsSupported)
+ {
+ ref Vector256 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector256.Count;
+ int m = ImageMaths.Modulo4(n);
+ int u = n - m;
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i);
+ ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
+
+ vd0 = Avx.Permute(vs0, control);
+ Unsafe.Add(ref vd0, 1) = Avx.Permute(Unsafe.Add(ref vs0, 1), control);
+ Unsafe.Add(ref vd0, 2) = Avx.Permute(Unsafe.Add(ref vs0, 2), control);
+ Unsafe.Add(ref vd0, 3) = Avx.Permute(Unsafe.Add(ref vs0, 3), control);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++)
+ {
+ Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control);
+ }
+ }
+ }
+ else
+ {
+ // Sse
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector128.Count;
+ int m = ImageMaths.Modulo4(n);
+ int u = n - m;
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i);
+ ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
+
+ vd0 = Sse.Shuffle(vs0, vs0, control);
+
+ Vector128 vs1 = Unsafe.Add(ref vs0, 1);
+ Unsafe.Add(ref vd0, 1) = Sse.Shuffle(vs1, vs1, control);
+
+ Vector128 vs2 = Unsafe.Add(ref vs0, 2);
+ Unsafe.Add(ref vd0, 2) = Sse.Shuffle(vs2, vs2, control);
+
+ Vector128 vs3 = Unsafe.Add(ref vs0, 3);
+ Unsafe.Add(ref vd0, 3) = Sse.Shuffle(vs3, vs3, control);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++)
+ {
+ Vector128 vs = Unsafe.Add(ref sourceBase, i);
+ Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control);
+ }
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Avx2.IsSupported)
+ {
+ // I've chosen to do this for convenience while we determine what
+ // shuffle controls to add to the library.
+ // We can add static ROS instances if need be in the future.
+ Span bytes = stackalloc byte[Vector256.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control);
+ Vector256 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector256 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector256.Count;
+ int m = ImageMaths.Modulo4(n);
+ int u = n - m;
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i);
+
+ vd0 = Avx2.Shuffle(vs0, vshuffle);
+ Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle);
+ Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle);
+ Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++)
+ {
+ Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle);
+ }
+ }
+ }
+ else
+ {
+ // Ssse3
+ Span bytes = stackalloc byte[Vector128.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control);
+ Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector128.Count;
+ int m = ImageMaths.Modulo4(n);
+ int u = n - m;
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i);
+
+ vd0 = Ssse3.Shuffle(vs0, vshuffle);
+ Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle);
+ Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle);
+ Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++)
+ {
+ Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle);
+ }
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle3(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16);
+ Vector128 vmask = Unsafe.As>(ref vmaskBase);
+ ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16);
+ Vector128 vmasko = Unsafe.As>(ref vmaskoBase);
+ Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12);
+
+ Span bytes = stackalloc byte[Vector128.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control);
+ Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = source.Length / Vector128.Count;
+
+ for (int i = 0; i < n; i += 3)
+ {
+ ref Vector128 vs = ref Unsafe.Add(ref sourceBase, i);
+
+ Vector128 v0 = vs;
+ Vector128 v1 = Unsafe.Add(ref vs, 1);
+ Vector128 v2 = Unsafe.Add(ref vs, 2);
+ Vector128 v3 = Sse2.ShiftRightLogical128BitLane(v2, 4);
+
+ v2 = Ssse3.AlignRight(v2, v1, 8);
+ v1 = Ssse3.AlignRight(v1, v0, 12);
+
+ v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vmask), vshuffle);
+ v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vmask), vshuffle);
+ v2 = Ssse3.Shuffle(Ssse3.Shuffle(v2, vmask), vshuffle);
+ v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vmask), vshuffle);
+
+ v0 = Ssse3.Shuffle(v0, vmaske);
+ v1 = Ssse3.Shuffle(v1, vmasko);
+ v2 = Ssse3.Shuffle(v2, vmaske);
+ v3 = Ssse3.Shuffle(v3, vmasko);
+
+ v0 = Ssse3.AlignRight(v1, v0, 4);
+ v3 = Ssse3.AlignRight(v3, v2, 12);
+
+ v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4);
+ v2 = Sse2.ShiftRightLogical128BitLane(v2, 4);
+
+ v1 = Ssse3.AlignRight(v2, v1, 8);
+
+ ref Vector128 vd = ref Unsafe.Add(ref destBase, i);
+
+ vd = v0;
+ Unsafe.Add(ref vd, 1) = v1;
+ Unsafe.Add(ref vd, 2) = v3;
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Pad3Shuffle4(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16);
+ Vector128 vmask = Unsafe.As>(ref vmaskBase);
+ Vector128 vfill = Vector128.Create(0xff000000ff000000ul).AsByte();
+
+ Span bytes = stackalloc byte[Vector128.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control);
+ Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = source.Length / Vector128.Count;
+
+ for (int i = 0, j = 0; i < n; i += 3, j += 4)
+ {
+ ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i);
+ Vector128 v1 = Unsafe.Add(ref v0, 1);
+ Vector128 v2 = Unsafe.Add(ref v0, 2);
+ Vector128 v3 = Sse2.ShiftRightLogical128BitLane(v2, 4);
+
+ v2 = Ssse3.AlignRight(v2, v1, 8);
+ v1 = Ssse3.AlignRight(v1, v0, 12);
+
+ ref Vector128 vd = ref Unsafe.Add(ref destBase, j);
+
+ vd = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v0, vmask), vfill), vshuffle);
+ Unsafe.Add(ref vd, 1) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v1, vmask), vfill), vshuffle);
+ Unsafe.Add(ref vd, 2) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v2, vmask), vfill), vshuffle);
+ Unsafe.Add(ref vd, 3) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v3, vmask), vfill), vshuffle);
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4Slice3(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Ssse3.IsSupported)
+ {
+ ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16);
+ Vector128 vmasko = Unsafe.As>(ref vmaskoBase);
+ Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12);
+
+ Span bytes = stackalloc byte[Vector128.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control);
+ Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = source.Length / Vector128.Count;
+
+ for (int i = 0, j = 0; i < n; i += 4, j += 3)
+ {
+ ref Vector128 vs = ref Unsafe.Add(ref sourceBase, i);
+
+ Vector128 v0 = vs;
+ Vector128 v1 = Unsafe.Add(ref vs, 1);
+ Vector128 v2 = Unsafe.Add(ref vs, 2);
+ Vector128 v3 = Unsafe.Add(ref vs, 3);
+
+ v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vshuffle), vmaske);
+ v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vshuffle), vmasko);
+ v2 = Ssse3.Shuffle(Ssse3.Shuffle(v2, vshuffle), vmaske);
+ v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vshuffle), vmasko);
+
+ v0 = Ssse3.AlignRight(v1, v0, 4);
+ v3 = Ssse3.AlignRight(v3, v2, 12);
+
+ v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4);
+ v2 = Sse2.ShiftRightLogical128BitLane(v2, 4);
+
+ v1 = Ssse3.AlignRight(v2, v1, 8);
+
+ ref Vector128 vd = ref Unsafe.Add(ref destBase, j);
+
+ vd = v0;
+ Unsafe.Add(ref vd, 1) = v1;
+ Unsafe.Add(ref vd, 2) = v3;
+ }
+ }
+ }
+
///
/// Performs a multiplication and an addition of the .
///
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
new file mode 100644
index 0000000000..07744566a3
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
@@ -0,0 +1,275 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ internal static partial class SimdUtils
+ {
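+ /// <summary>
+ /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of floats.</param>
+ /// <param name="dest">The destination span of floats.</param>
+ /// <param name="control">The byte control.</param>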
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ VerifyShuffle4SpanInput(source, dest);
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.Shuffle4Reduce(ref source, ref dest, control);
+#endif
+
+ // Deal with the remainder:
+ if (source.Length > 0)
+ {
+ Shuffle4Remainder(source, dest, control);
+ }
+ }
+
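+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="shuffle">The type of shuffle to perform.</param>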
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4(
+ ReadOnlySpan source,
+ Span dest,
+ TShuffle shuffle)
+ where TShuffle : struct, IShuffle4
+ {
+ VerifyShuffle4SpanInput(source, dest);
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.Shuffle4Reduce(ref source, ref dest, shuffle.Control);
+#endif
+
+ // Deal with the remainder:
+ if (source.Length > 0)
+ {
+ shuffle.RunFallbackShuffle(source, dest);
+ }
+ }
+
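+ /// <summary>
+ /// Shuffle 8-bit integer triplets within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="shuffle">The type of shuffle to perform.</param>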
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle3(
+ ReadOnlySpan source,
+ Span dest,
+ TShuffle shuffle)
+ where TShuffle : struct, IShuffle3
+ {
+ VerifyShuffle3SpanInput(source, dest);
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.Shuffle3Reduce(ref source, ref dest, shuffle.Control);
+#endif
+
+ // Deal with the remainder:
+ if (source.Length > 0)
+ {
+ shuffle.RunFallbackShuffle(source, dest);
+ }
+ }
+
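+ /// <summary>
+ /// Pads then shuffles 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="shuffle">The type of shuffle to perform.</param>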
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Pad3Shuffle4(
+ ReadOnlySpan source,
+ Span dest,
+ TShuffle shuffle)
+ where TShuffle : struct, IPad3Shuffle4
+ {
+ VerifyPad3Shuffle4SpanInput(source, dest);
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, shuffle.Control);
+#endif
+
+ // Deal with the remainder:
+ if (source.Length > 0)
+ {
+ shuffle.RunFallbackShuffle(source, dest);
+ }
+ }
+
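+ /// <summary>
+ /// Shuffles then slices 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="shuffle">The type of shuffle to perform.</param>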
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Slice3(
+ ReadOnlySpan source,
+ Span dest,
+ TShuffle shuffle)
+ where TShuffle : struct, IShuffle4Slice3
+ {
+ VerifyShuffle4Slice3SpanInput(source, dest);
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, shuffle.Control);
+#endif
+
+ // Deal with the remainder:
+ if (source.Length > 0)
+ {
+ shuffle.RunFallbackShuffle(source, dest);
+ }
+ }
+
+ private static void Shuffle4Remainder(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ ref float sBase = ref MemoryMarshal.GetReference(source);
+ ref float dBase = ref MemoryMarshal.GetReference(dest);
+ Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);
+
+ for (int i = 0; i < source.Length; i += 4)
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i);
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+ Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
+ }
+ }
+
+ [Conditional("DEBUG")]
+ private static void VerifyShuffle4SpanInput(ReadOnlySpan source, Span dest)
+ where T : struct
+ {
+ DebugGuard.IsTrue(
+ source.Length == dest.Length,
+ nameof(source),
+ "Input spans must be of same length!");
+
+ DebugGuard.IsTrue(
+ source.Length % 4 == 0,
+ nameof(source),
+ "Input spans must be divisable by 4!");
+ }
+
+ [Conditional("DEBUG")]
+ private static void VerifyShuffle3SpanInput(ReadOnlySpan source, Span dest)
+ where T : struct
+ {
+ DebugGuard.IsTrue(
+ source.Length == dest.Length,
+ nameof(source),
+ "Input spans must be of same length!");
+
+ DebugGuard.IsTrue(
+ source.Length % 3 == 0,
+ nameof(source),
+ "Input spans must be divisable by 3!");
+ }
+
+ [Conditional("DEBUG")]
+ private static void VerifyPad3Shuffle4SpanInput(ReadOnlySpan source, Span dest)
+ {
+ DebugGuard.IsTrue(
+ source.Length % 3 == 0,
+ nameof(source),
+ "Input span must be divisable by 3!");
+
+ DebugGuard.IsTrue(
+ dest.Length % 4 == 0,
+ nameof(dest),
+ "Output span must be divisable by 4!");
+
+ DebugGuard.IsTrue(
+ source.Length == dest.Length * 3 / 4,
+ nameof(source),
+ "Input span must be 3/4 the length of the output span!");
+ }
+
+ [Conditional("DEBUG")]
+ private static void VerifyShuffle4Slice3SpanInput(ReadOnlySpan source, Span dest)
+ {
+ DebugGuard.IsTrue(
+ source.Length % 4 == 0,
+ nameof(source),
+ "Input span must be divisable by 4!");
+
+ DebugGuard.IsTrue(
+ dest.Length % 3 == 0,
+ nameof(dest),
+ "Output span must be divisable by 3!");
+
+ DebugGuard.IsTrue(
+ dest.Length >= source.Length * 3 / 4,
+ nameof(source),
+ "Output span must be at least 3/4 the length of the input span!");
+ }
+
+ public static class Shuffle
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static byte MmShuffle(byte p3, byte p2, byte p1, byte p0)
+ => (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0);
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void MmShuffleSpan(ref Span span, byte control)
+ {
+ InverseMmShuffle(
+ control,
+ out int p3,
+ out int p2,
+ out int p1,
+ out int p0);
+
+ ref byte spanBase = ref MemoryMarshal.GetReference(span);
+
+ for (int i = 0; i < span.Length; i += 4)
+ {
+ Unsafe.Add(ref spanBase, i) = (byte)(p0 + i);
+ Unsafe.Add(ref spanBase, i + 1) = (byte)(p1 + i);
+ Unsafe.Add(ref spanBase, i + 2) = (byte)(p2 + i);
+ Unsafe.Add(ref spanBase, i + 3) = (byte)(p3 + i);
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void InverseMmShuffle(
+ byte control,
+ out int p3,
+ out int p2,
+ out int p1,
+ out int p0)
+ {
+ p3 = control >> 6 & 0x3;
+ p2 = control >> 4 & 0x3;
+ p1 = control >> 2 & 0x3;
+ p0 = control >> 0 & 0x3;
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Image{TPixel}.cs b/src/ImageSharp/Image{TPixel}.cs
index 255193c8ea..83ecc37530 100644
--- a/src/ImageSharp/Image{TPixel}.cs
+++ b/src/ImageSharp/Image{TPixel}.cs
@@ -201,14 +201,14 @@ namespace SixLabors.ImageSharp
public bool TryGetSinglePixelSpan(out Span span)
{
IMemoryGroup mg = this.GetPixelMemoryGroup();
- if (mg.Count > 1)
+ if (mg.Count == 1)
{
- span = default;
- return false;
+ span = mg[0].Span;
+ return true;
}
- span = mg.Single().Span;
- return true;
+ span = default;
+ return false;
}
///
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs
index 0b1292b641..d30616997c 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs
@@ -53,84 +53,112 @@ namespace SixLabors.ImageSharp.PixelFormats
Vector4Converters.RgbaCompatible.ToVector4(configuration, this, sourcePixels, destVectors, modifiers.Remove(PixelConversionModifiers.Scale));
}
///
- public override void ToRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void ToRgba32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromArgb32.ToRgba32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels);
+ Span dest = MemoryMarshal.Cast(destinationPixels);
+ PixelConverter.FromArgb32.ToRgba32(source, dest);
}
///
- public override void FromRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void FromRgba32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromRgba32.ToArgb32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels);
+ Span dest = MemoryMarshal.Cast(destinationPixels);
+ PixelConverter.FromRgba32.ToArgb32(source, dest);
}
///
- public override void ToBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void ToBgra32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromArgb32.ToBgra32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels);
+ Span dest = MemoryMarshal.Cast(destinationPixels);
+ PixelConverter.FromArgb32.ToBgra32(source, dest);
}
///
- public override void FromBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void FromBgra32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels);
+ Span dest = MemoryMarshal.Cast(destinationPixels);
+ PixelConverter.FromBgra32.ToArgb32(source, dest);
+ }
+ ///
+ public override void ToRgb24(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
+ {
+ Guard.NotNull(configuration, nameof(configuration));
+ Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromBgra32.ToArgb32(sp);
- }
+ ReadOnlySpan