diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 0e093a8347..c8f3997946 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -17,11 +17,11 @@ jobs:
- os: ubuntu-latest
framework: netcoreapp3.1
runtime: -x64
- codecov: false
+ codecov: true
- os: windows-latest
framework: netcoreapp3.1
runtime: -x64
- codecov: true
+ codecov: false
- os: windows-latest
framework: netcoreapp2.1
runtime: -x64
diff --git a/.runsettings b/.runsettings
new file mode 100644
index 0000000000..ca48342bd6
--- /dev/null
+++ b/.runsettings
@@ -0,0 +1,7 @@
+
+
+
+
+ category!=failing
+
+
diff --git a/Directory.Build.props b/Directory.Build.props
index 0f9c5bdde2..bb97810a8f 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -15,6 +15,7 @@
$(MSBuildThisFileDirectory)artifacts/
$(SixLaborsProjectCategory)/$(MSBuildProjectName)
https://github.com/SixLabors/ImageSharp/
+ $(MSBuildThisFileDirectory)/.runsettings
@@ -120,6 +121,7 @@
https://api.nuget.org/v3/index.json;
https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-eng/nuget/v3/index.json;
+ https://www.myget.org/F/coverlet-dev/api/v3/index.json;
true
$(MSBuildThisFileDirectory)shared-infrastructure/SixLabors.snk
diff --git a/Directory.Build.targets b/Directory.Build.targets
index 4e7ab9e6b7..2a7d25b977 100644
--- a/Directory.Build.targets
+++ b/Directory.Build.targets
@@ -18,22 +18,18 @@
-
+
-
+
-
-
+
diff --git a/ImageSharp.sln b/ImageSharp.sln
index 509dcf96bf..b1d3176ad2 100644
--- a/ImageSharp.sln
+++ b/ImageSharp.sln
@@ -9,6 +9,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
.gitattributes = .gitattributes
.gitignore = .gitignore
.gitmodules = .gitmodules
+ .runsettings = .runsettings
ci-build.ps1 = ci-build.ps1
ci-pack.ps1 = ci-pack.ps1
ci-test.ps1 = ci-test.ps1
diff --git a/src/Directory.Build.targets b/src/Directory.Build.targets
index d1875262d3..9b8be05b56 100644
--- a/src/Directory.Build.targets
+++ b/src/Directory.Build.targets
@@ -21,16 +21,25 @@
-
+
+ $([System.IO.Path]::Combine('$(IntermediateOutputPath)','$(TargetFrameworkMoniker).AssemblyAttributes$(DefaultLanguageSourceExtension)'))
+
+
+
+
+
+
+
+
+ DependsOnTargets="InitializeSourceRootMappedPaths"
+ Returns="@(_LocalTopLevelSourceRoot)"
+ Condition="'$(DeterministicSourcePaths)' == 'true'">
<_LocalTopLevelSourceRoot Include="@(SourceRoot)" Condition="'%(SourceRoot.NestedRoot)' == ''"/>
-
+
false
@@ -62,7 +71,7 @@
-
+
@@ -74,7 +83,7 @@
SkipUnchangedFiles = "true"
DestinationFolder="..\..\" />
-
+
-
+
diff --git a/src/ImageSharp/Common/Helpers/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/IComponentShuffle.cs
new file mode 100644
index 0000000000..e354a57b00
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/IComponentShuffle.cs
@@ -0,0 +1,165 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Buffers.Binary;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ /// <summary>
+ /// Defines the contract for methods that allow the shuffling of pixel components.
+ /// Used for shuffling on platforms that do not support Hardware Intrinsics.
+ /// </summary>
+ internal interface IComponentShuffle
+ {
+ /// <summary>
+ /// Gets the shuffle control.
+ /// </summary>
+ byte Control { get; }
+
+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ void RunFallbackShuffle(ReadOnlySpan source, Span dest);
+ }
+
+ internal readonly struct DefaultShuffle4 : IComponentShuffle // Byte-by-byte fallback for an arbitrary control byte.
+ {
+ public DefaultShuffle4(byte p3, byte p2, byte p1, byte p0)
+ : this(SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0)) // Pack the four selectors into one control byte.
+ {
+ }
+
+ public DefaultShuffle4(byte control) => this.Control = control;
+
+ public byte Control { get; }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+ SimdUtils.Shuffle.InverseMmShuffle( // Unpack the control byte into four source offsets.
+ this.Control,
+ out int p3,
+ out int p2,
+ out int p1,
+ out int p0);
+
+ for (int i = 0; i < source.Length; i += 4) // One 4-byte group per iteration; Unsafe.Add performs no bounds checks.
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i); // dest[i + k] = source[i + pk]
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+ Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
+ }
+ }
+ }
+
+ internal readonly struct WXYZShuffle4 : IComponentShuffle // Fallback that moves the last byte of each 4-byte group to the front.
+ {
+ public byte Control => SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3); // Per group: dest = [src3, src0, src1, src2].
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ReadOnlySpan s = MemoryMarshal.Cast(source); // Process each 4-byte group as a single uint.
+ Span d = MemoryMarshal.Cast(dest);
+ ref uint sBase = ref MemoryMarshal.GetReference(s);
+ ref uint dBase = ref MemoryMarshal.GetReference(d);
+
+ // The JIT can detect and optimize rotation idioms ROTL (Rotate Left)
+ // and ROTR (Rotate Right) emitting efficient CPU instructions:
+ // https://github.com/dotnet/coreclr/pull/1830
+ for (int i = 0; i < s.Length; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte first)
+ // ROTL(8, packed) = [Z Y X W]
+ Unsafe.Add(ref dBase, i) = (packed << 8) | (packed >> 24); // NOTE(review): matches Control only when uints are little-endian - confirm targets.
+ }
+ }
+ }
+
+ internal readonly struct WZYXShuffle4 : IComponentShuffle // Fallback that reverses the byte order of each 4-byte group.
+ {
+ public byte Control => SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3); // Per group: dest = [src3, src2, src1, src0].
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ReadOnlySpan s = MemoryMarshal.Cast(source); // Process each 4-byte group as a single uint.
+ Span d = MemoryMarshal.Cast(dest);
+ ref uint sBase = ref MemoryMarshal.GetReference(s);
+ ref uint dBase = ref MemoryMarshal.GetReference(d);
+
+ for (int i = 0; i < s.Length; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte first)
+ // REVERSE(packed) = [X Y Z W]
+ Unsafe.Add(ref dBase, i) = BinaryPrimitives.ReverseEndianness(packed);
+ }
+ }
+ }
+
+ internal readonly struct YZWXShuffle4 : IComponentShuffle // Fallback that moves the first byte of each 4-byte group to the end.
+ {
+ public byte Control => SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1); // Per group: dest = [src1, src2, src3, src0].
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ReadOnlySpan s = MemoryMarshal.Cast(source); // Process each 4-byte group as a single uint.
+ Span d = MemoryMarshal.Cast(dest);
+ ref uint sBase = ref MemoryMarshal.GetReference(s);
+ ref uint dBase = ref MemoryMarshal.GetReference(d);
+
+ for (int i = 0; i < s.Length; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte first)
+ // ROTR(8, packed) = [X W Z Y]
+ Unsafe.Add(ref dBase, i) = (packed >> 8) | (packed << 24); // NOTE(review): matches Control only when uints are little-endian - confirm targets.
+ }
+ }
+ }
+
+ internal readonly struct ZYXWShuffle4 : IComponentShuffle // Fallback that swaps the first and third byte of each 4-byte group.
+ {
+ public byte Control => SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); // Per group: dest = [src2, src1, src0, src3].
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ReadOnlySpan s = MemoryMarshal.Cast(source); // Process each 4-byte group as a single uint.
+ Span d = MemoryMarshal.Cast(dest);
+ ref uint sBase = ref MemoryMarshal.GetReference(s);
+ ref uint dBase = ref MemoryMarshal.GetReference(d);
+
+ for (int i = 0; i < s.Length; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte first)
+ // tmp1 = [W 0 Y 0]
+ // tmp2 = [0 Z 0 X]
+ // tmp3=ROTL(16, tmp2) = [0 X 0 Z]
+ // tmp1 + tmp3 = [W X Y Z]
+ uint tmp1 = packed & 0xFF00FF00; // Bytes that stay in place.
+ uint tmp2 = packed & 0x00FF00FF; // Bytes that trade places.
+ uint tmp3 = (tmp2 << 16) | (tmp2 >> 16);
+
+ Unsafe.Add(ref dBase, i) = tmp1 + tmp3; // The masks are disjoint, so + acts as a bitwise OR here.
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs
index 977432f8bb..d24230fe18 100644
--- a/src/ImageSharp/Common/Helpers/ImageMaths.cs
+++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs
@@ -132,6 +132,12 @@ namespace SixLabors.ImageSharp
return (a / GreatestCommonDivisor(a, b)) * b;
}
+ /// <summary>
+ /// Calculates <paramref name="x"/> % 2
+ /// </summary>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static int Modulo2(int x) => x & 1; // NOTE: yields 1 for negative odd x, whereas x % 2 yields -1.
+
///
/// Calculates % 4
///
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
deleted file mode 100644
index b56c92dab7..0000000000
--- a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-#if SUPPORTS_RUNTIME_INTRINSICS
-
-using System;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-
-namespace SixLabors.ImageSharp
-{
- internal static partial class SimdUtils
- {
- public static class Avx2Intrinsics
- {
- private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
-
- ///
- /// as many elements as possible, slicing them down (keeping the remainder).
- ///
- [MethodImpl(InliningOptions.ShortMethod)]
- internal static void NormalizedFloatToByteSaturateReduce(
- ref ReadOnlySpan source,
- ref Span dest)
- {
- DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
-
- if (Avx2.IsSupported)
- {
- int remainder = ImageMaths.ModuloP2(source.Length, Vector.Count);
- int adjustedCount = source.Length - remainder;
-
- if (adjustedCount > 0)
- {
- NormalizedFloatToByteSaturate(
- source.Slice(0, adjustedCount),
- dest.Slice(0, adjustedCount));
-
- source = source.Slice(adjustedCount);
- dest = dest.Slice(adjustedCount);
- }
- }
- }
-
- ///
- /// Implementation of , which is faster on new .NET runtime.
- ///
- ///
- /// Implementation is based on MagicScaler code:
- /// https://github.com/saucecontrol/PhotoSauce/blob/a9bd6e5162d2160419f0cf743fd4f536c079170b/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L453-L477
- ///
- internal static void NormalizedFloatToByteSaturate(
- ReadOnlySpan source,
- Span dest)
- {
- VerifySpanInput(source, dest, Vector256.Count);
-
- int n = dest.Length / Vector256.Count;
-
- ref Vector256 sourceBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
- ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
-
- var maxBytes = Vector256.Create(255f);
- ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
- Vector256 mask = Unsafe.As>(ref maskBase);
-
- for (int i = 0; i < n; i++)
- {
- ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4);
-
- Vector256 f0 = s;
- Vector256 f1 = Unsafe.Add(ref s, 1);
- Vector256 f2 = Unsafe.Add(ref s, 2);
- Vector256 f3 = Unsafe.Add(ref s, 3);
-
- Vector256 w0 = ConvertToInt32(f0, maxBytes);
- Vector256 w1 = ConvertToInt32(f1, maxBytes);
- Vector256 w2 = ConvertToInt32(f2, maxBytes);
- Vector256 w3 = ConvertToInt32(f3, maxBytes);
-
- Vector256 u0 = Avx2.PackSignedSaturate(w0, w1);
- Vector256 u1 = Avx2.PackSignedSaturate(w2, w3);
- Vector256 b = Avx2.PackUnsignedSaturate(u0, u1);
- b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();
-
- Unsafe.Add(ref destBase, i) = b;
- }
- }
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static Vector256 ConvertToInt32(Vector256 vf, Vector256 scale)
- {
- vf = Avx.Multiply(vf, scale);
- return Avx.ConvertToVector256Int32(vf);
- }
- }
- }
-}
-#endif
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
new file mode 100644
index 0000000000..782328eddf
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -0,0 +1,529 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp
+{
+ internal static partial class SimdUtils
+ {
+ public static class HwIntrinsics
+ {
+ public static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; // Read as little-endian Int32 indices: 0, 4, 1, 5, 2, 6, 3, 7.
+
+ public static ReadOnlySpan PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; // Read as little-endian Int32 indices: 0, 2, 4, 6, 1, 3, 5, 7.
+
+ /// <summary>
+ /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of floats.</param>
+ /// <param name="dest">The destination span of floats.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4ChannelReduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Avx.IsSupported || Sse.IsSupported)
+ {
+ int remainder = Avx.IsSupported // Elements that do not fill a whole vector of the widest supported size.
+ ? ImageMaths.ModuloP2(source.Length, Vector256.Count)
+ : ImageMaths.ModuloP2(source.Length, Vector128.Count);
+
+ int adjustedCount = source.Length - remainder;
+
+ if (adjustedCount > 0)
+ {
+ Shuffle4Channel(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount),
+ control);
+
+ source = source.Slice(adjustedCount); // Hand only the unprocessed tail back to the caller.
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4ChannelReduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Avx2.IsSupported || Ssse3.IsSupported)
+ {
+ int remainder = Avx2.IsSupported // Elements that do not fill a whole vector of the widest supported size.
+ ? ImageMaths.ModuloP2(source.Length, Vector256.Count)
+ : ImageMaths.ModuloP2(source.Length, Vector128.Count);
+
+ int adjustedCount = source.Length - remainder;
+
+ if (adjustedCount > 0)
+ {
+ Shuffle4Channel(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount),
+ control);
+
+ source = source.Slice(adjustedCount); // Hand only the unprocessed tail back to the caller.
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4Channel( // Vectorized float shuffle; the public Reduce overload passes a length that is a whole number of vectors.
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Avx.IsSupported)
+ {
+ ref Vector256 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector256.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x unrolled loop.
+ int u = n - m; // Vectors handled by the unrolled loop.
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i);
+ ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
+
+ vd0 = Avx.Permute(vs0, control);
+ Unsafe.Add(ref vd0, 1) = Avx.Permute(Unsafe.Add(ref vs0, 1), control);
+ Unsafe.Add(ref vd0, 2) = Avx.Permute(Unsafe.Add(ref vs0, 2), control);
+ Unsafe.Add(ref vd0, 3) = Avx.Permute(Unsafe.Add(ref vs0, 3), control);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Remaining vectors, one at a time.
+ {
+ Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control);
+ }
+ }
+ }
+ else
+ {
+ // Sse
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector128.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x unrolled loop.
+ int u = n - m; // Vectors handled by the unrolled loop.
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i);
+ ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
+
+ vd0 = Sse.Shuffle(vs0, vs0, control); // Same vector for both operands, so every selector picks from it.
+
+ Vector128 vs1 = Unsafe.Add(ref vs0, 1);
+ Unsafe.Add(ref vd0, 1) = Sse.Shuffle(vs1, vs1, control);
+
+ Vector128 vs2 = Unsafe.Add(ref vs0, 2);
+ Unsafe.Add(ref vd0, 2) = Sse.Shuffle(vs2, vs2, control);
+
+ Vector128 vs3 = Unsafe.Add(ref vs0, 3);
+ Unsafe.Add(ref vd0, 3) = Sse.Shuffle(vs3, vs3, control);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Remaining vectors, one at a time.
+ {
+ Vector128 vs = Unsafe.Add(ref sourceBase, i);
+ Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control);
+ }
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4Channel( // Vectorized byte shuffle; the public Reduce overload passes a length that is a whole number of vectors.
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Avx2.IsSupported)
+ {
+ // I've chosen to do this for convenience while we determine what
+ // shuffle controls to add to the library.
+ // We can add static ROS instances if need be in the future.
+ Span bytes = stackalloc byte[Vector256.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control); // Expand the control byte into a per-byte index mask.
+ Vector256 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector256 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector256.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x unrolled loop.
+ int u = n - m; // Vectors handled by the unrolled loop.
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i);
+
+ vd0 = Avx2.Shuffle(vs0, vcm);
+ Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vcm);
+ Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vcm);
+ Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vcm);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Remaining vectors, one at a time.
+ {
+ Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm);
+ }
+ }
+ }
+ else
+ {
+ // Ssse3
+ Span bytes = stackalloc byte[Vector128.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control); // Expand the control byte into a per-byte index mask.
+ Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector128.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x unrolled loop.
+ int u = n - m; // Vectors handled by the unrolled loop.
+
+ for (int i = 0; i < u; i += 4)
+ {
+ ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i);
+
+ vd0 = Ssse3.Shuffle(vs0, vcm);
+ Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vcm);
+ Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vcm);
+ Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vcm);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Remaining vectors, one at a time.
+ {
+ Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vcm);
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
+ /// </summary>
+ /// <param name="va">The vector to add to the intermediate result.</param>
+ /// <param name="vm0">The first vector to multiply.</param>
+ /// <param name="vm1">The second vector to multiply.</param>
+ /// <returns>The <see cref="Vector256{T}"/>.</returns>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static Vector256 MultiplyAdd(
+ in Vector256 va,
+ in Vector256 vm0,
+ in Vector256 vm1)
+ {
+ if (Fma.IsSupported)
+ {
+ return Fma.MultiplyAdd(vm1, vm0, va); // (vm1 * vm0) + va as one fused operation.
+ }
+ else
+ {
+ return Avx.Add(Avx.Multiply(vm0, vm1), va); // NOTE: rounds twice, so the result may differ slightly from the fused path.
+ }
+ }
+
+ /// <summary>
+ /// Runs <see cref="ByteToNormalizedFloat"/> on as many elements as possible, slicing them down (keeping the remainder).
+ /// </summary>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ internal static void ByteToNormalizedFloatReduce(
+ ref ReadOnlySpan source,
+ ref Span dest)
+ {
+ DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+ if (Avx2.IsSupported || Sse2.IsSupported)
+ {
+ int remainder; // Elements that do not fill a whole vector of the widest supported size.
+ if (Avx2.IsSupported)
+ {
+ remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count);
+ }
+ else
+ {
+ remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count);
+ }
+
+ int adjustedCount = source.Length - remainder;
+
+ if (adjustedCount > 0)
+ {
+ ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
+
+ source = source.Slice(adjustedCount); // Hand only the unprocessed tail back to the caller.
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
+ /// </summary>
+ /// <remarks>
+ /// Implementation is based on MagicScaler code:
+ /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
+ /// </remarks>
+ internal static unsafe void ByteToNormalizedFloat(
+ ReadOnlySpan source,
+ Span dest)
+ {
+ if (Avx2.IsSupported)
+ {
+ VerifySpanInput(source, dest, Vector256.Count);
+
+ int n = dest.Length / Vector256.Count;
+
+ byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)); // NOTE(review): Unsafe.AsPointer does not pin; confirm callers pass pinned memory or switch to a fixed statement.
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ var scale = Vector256.Create(1 / (float)byte.MaxValue); // 1/255 in every lane.
+
+ for (int i = 0; i < n; i++)
+ {
+ int si = Vector256.Count * i;
+ Vector256 i0 = Avx2.ConvertToVector256Int32(sourceBase + si); // Widen consecutive byte groups to Int32 lanes.
+ Vector256 i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256.Count);
+ Vector256 i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256.Count * 2));
+ Vector256 i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256.Count * 3));
+
+ Vector256 f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0)); // Convert to float and scale by 1/255.
+ Vector256 f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
+ Vector256 f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
+ Vector256 f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));
+
+ ref Vector256 d = ref Unsafe.Add(ref destBase, i * 4); // Four float vectors are written per iteration.
+
+ d = f0;
+ Unsafe.Add(ref d, 1) = f1;
+ Unsafe.Add(ref d, 2) = f2;
+ Unsafe.Add(ref d, 3) = f3;
+ }
+ }
+ else
+ {
+ // Sse
+ VerifySpanInput(source, dest, Vector128.Count);
+
+ int n = dest.Length / Vector128.Count;
+
+ byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)); // NOTE(review): Unsafe.AsPointer does not pin; confirm callers pass pinned memory or switch to a fixed statement.
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ var scale = Vector128.Create(1 / (float)byte.MaxValue); // 1/255 in every lane.
+ Vector128 zero = Vector128.Zero;
+
+ for (int i = 0; i < n; i++)
+ {
+ int si = Vector128.Count * i;
+
+ Vector128 i0, i1, i2, i3;
+ if (Sse41.IsSupported)
+ {
+ i0 = Sse41.ConvertToVector128Int32(sourceBase + si); // Widen consecutive byte groups to Int32 lanes.
+ i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128.Count);
+ i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128.Count * 2));
+ i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128.Count * 3));
+ }
+ else
+ {
+ Vector128 b = Sse2.LoadVector128(sourceBase + si);
+ Vector128 s0 = Sse2.UnpackLow(b, zero).AsInt16(); // Interleave with zero to widen to 16 bits.
+ Vector128 s1 = Sse2.UnpackHigh(b, zero).AsInt16();
+
+ i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32(); // Interleave with zero again to widen to 32 bits.
+ i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
+ i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
+ i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
+ }
+
+ Vector128 f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0)); // Convert to float and scale by 1/255.
+ Vector128 f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
+ Vector128 f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
+ Vector128 f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));
+
+ ref Vector128 d = ref Unsafe.Add(ref destBase, i * 4); // Four float vectors are written per iteration.
+
+ d = f0;
+ Unsafe.Add(ref d, 1) = f1;
+ Unsafe.Add(ref d, 2) = f2;
+ Unsafe.Add(ref d, 3) = f3;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Runs <see cref="NormalizedFloatToByteSaturate"/> on as many elements as possible, slicing them down (keeping the remainder).
+ /// </summary>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ internal static void NormalizedFloatToByteSaturateReduce(
+ ref ReadOnlySpan source,
+ ref Span dest)
+ {
+ DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+ if (Avx2.IsSupported || Sse2.IsSupported)
+ {
+ int remainder; // Elements that do not fill a whole vector of the widest supported size.
+ if (Avx2.IsSupported)
+ {
+ remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count);
+ }
+ else
+ {
+ remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count);
+ }
+
+ int adjustedCount = source.Length - remainder;
+
+ if (adjustedCount > 0)
+ {
+ NormalizedFloatToByteSaturate(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount));
+
+ source = source.Slice(adjustedCount); // Hand only the unprocessed tail back to the caller.
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
+ /// </summary>
+ /// <remarks>
+ /// Implementation is based on MagicScaler code:
+ /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622
+ /// </remarks>
+ internal static void NormalizedFloatToByteSaturate(
+ ReadOnlySpan source,
+ Span dest)
+ {
+ if (Avx2.IsSupported)
+ {
+ VerifySpanInput(source, dest, Vector256.Count);
+
+ int n = dest.Length / Vector256.Count;
+
+ ref Vector256 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ var scale = Vector256.Create((float)byte.MaxValue); // 255 in every lane.
+ ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
+ Vector256 mask = Unsafe.As>(ref maskBase);
+
+ for (int i = 0; i < n; i++)
+ {
+ ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4); // Four float vectors are read per iteration.
+
+ Vector256 f0 = Avx.Multiply(scale, s);
+ Vector256 f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1));
+ Vector256 f2 = Avx.Multiply(scale, Unsafe.Add(ref s, 2));
+ Vector256 f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3));
+
+ Vector256 w0 = Avx.ConvertToVector256Int32(f0);
+ Vector256 w1 = Avx.ConvertToVector256Int32(f1);
+ Vector256 w2 = Avx.ConvertToVector256Int32(f2);
+ Vector256 w3 = Avx.ConvertToVector256Int32(f3);
+
+ Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); // Narrow with saturation.
+ Vector256 u1 = Avx2.PackSignedSaturate(w2, w3);
+ Vector256 b = Avx2.PackUnsignedSaturate(u0, u1); // Narrow to bytes, saturating to 0..255.
+ b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte(); // The 256-bit packs work per 128-bit lane; reorder 32-bit groups as 0, 4, 1, 5, 2, 6, 3, 7.
+
+ Unsafe.Add(ref destBase, i) = b;
+ }
+ }
+ else
+ {
+ // Sse
+ VerifySpanInput(source, dest, Vector128.Count);
+
+ int n = dest.Length / Vector128.Count;
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ var scale = Vector128.Create((float)byte.MaxValue); // 255 in every lane.
+
+ for (int i = 0; i < n; i++)
+ {
+ ref Vector128 s = ref Unsafe.Add(ref sourceBase, i * 4); // Four float vectors are read per iteration.
+
+ Vector128 f0 = Sse.Multiply(scale, s);
+ Vector128 f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1));
+ Vector128 f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2));
+ Vector128 f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3));
+
+ Vector128 w0 = Sse2.ConvertToVector128Int32(f0);
+ Vector128 w1 = Sse2.ConvertToVector128Int32(f1);
+ Vector128 w2 = Sse2.ConvertToVector128Int32(f2);
+ Vector128 w3 = Sse2.ConvertToVector128Int32(f3);
+
+ Vector128 u0 = Sse2.PackSignedSaturate(w0, w1); // Narrow with saturation.
+ Vector128 u1 = Sse2.PackSignedSaturate(w2, w3);
+
+ Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1); // Narrow to bytes, saturating to 0..255; no permute is applied on this path.
+ }
+ }
+ }
+ }
+ }
+}
+#endif
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
new file mode 100644
index 0000000000..a4a40fb4fa
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
@@ -0,0 +1,141 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ internal static partial class SimdUtils
+ {
+ /// <summary>
+ /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of floats.</param>
+ /// <param name="dest">The destination span of floats.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Channel(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ VerifyShuffleSpanInput(source, dest); // Debug-only check: equal lengths, divisible by 4.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control);
+#endif
+
+ // Deal with the remainder (everything, when no intrinsics path ran):
+ if (source.Length > 0)
+ {
+ ShuffleRemainder4Channel(source, dest, control);
+ }
+ }
+
+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="shuffle">The type of shuffle to perform.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Channel(
+ ReadOnlySpan source,
+ Span dest,
+ TShuffle shuffle)
+ where TShuffle : struct, IComponentShuffle
+ {
+ VerifyShuffleSpanInput(source, dest); // Debug-only check: equal lengths, divisible by 4.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, shuffle.Control);
+#endif
+
+ // Deal with the remainder (everything, when no intrinsics path ran):
+ if (source.Length > 0)
+ {
+ shuffle.RunFallbackShuffle(source, dest);
+ }
+ }
+
+ public static void ShuffleRemainder4Channel( // Scalar float shuffle, one 4-element group at a time.
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ ref float sBase = ref MemoryMarshal.GetReference(source);
+ ref float dBase = ref MemoryMarshal.GetReference(dest);
+ Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0); // Unpack the control byte into four source offsets.
+
+ for (int i = 0; i < source.Length; i += 4) // Unsafe.Add performs no bounds checks; the length must be a multiple of 4.
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i); // dest[i + k] = source[i + pk]
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+ Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
+ }
+ }
+
+ [Conditional("DEBUG")]
+ private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) // Debug-only validation of shuffle inputs.
+ where T : struct
+ {
+ DebugGuard.IsTrue( // Both spans must cover the same number of elements.
+ source.Length == dest.Length,
+ nameof(source),
+ "Input spans must be of same length!");
+
+ DebugGuard.IsTrue( // The shuffles work on whole 4-element groups.
+ source.Length % 4 == 0,
+ nameof(source),
+ "Input spans must be divisible by 4!");
+ }
+
+ public static class Shuffle // Helpers for building and decoding 4-element shuffle control bytes.
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static byte MmShuffle(byte p3, byte p2, byte p1, byte p0) // Packs four selectors, 2 bits each, with p3 in the top bits.
+ => (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0); // Inputs are not masked: a value above 3 spills into the neighbouring fields.
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void MmShuffleSpan(ref Span span, byte control) // Fills span with per-byte source indices for the given control.
+ {
+ InverseMmShuffle(
+ control,
+ out int p3,
+ out int p2,
+ out int p1,
+ out int p0);
+
+ ref byte spanBase = ref MemoryMarshal.GetReference(span);
+
+ for (int i = 0; i < span.Length; i += 4) // Unsafe.Add performs no bounds checks; the length must be a multiple of 4.
+ {
+ Unsafe.Add(ref spanBase, i) = (byte)(p0 + i); // Each index is the selector plus the start of its 4-byte group.
+ Unsafe.Add(ref spanBase, i + 1) = (byte)(p1 + i);
+ Unsafe.Add(ref spanBase, i + 2) = (byte)(p2 + i);
+ Unsafe.Add(ref spanBase, i + 3) = (byte)(p3 + i);
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void InverseMmShuffle( // Splits a control byte back into its four 2-bit selectors.
+ byte control,
+ out int p3,
+ out int p2,
+ out int p1,
+ out int p0)
+ {
+ p3 = control >> 6 & 0x3; // Shift binds tighter than &, so this is (control >> 6) & 0x3.
+ p2 = control >> 4 & 0x3;
+ p1 = control >> 2 & 0x3;
+ p0 = control >> 0 & 0x3;
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs
index 7f917648dc..df533cedf1 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs
@@ -79,8 +79,9 @@ namespace SixLabors.ImageSharp
internal static void ByteToNormalizedFloat(ReadOnlySpan source, Span dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
-
-#if SUPPORTS_EXTENDED_INTRINSICS
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest);
+#elif SUPPORTS_EXTENDED_INTRINSICS
ExtendedIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest);
#else
BasicIntrinsics256.ByteToNormalizedFloatReduce(ref source, ref dest);
@@ -110,7 +111,7 @@ namespace SixLabors.ImageSharp
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
#if SUPPORTS_RUNTIME_INTRINSICS
- Avx2Intrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest);
+ HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest);
#elif SUPPORTS_EXTENDED_INTRINSICS
ExtendedIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest);
#else
diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs
index fccc50755d..f617e9a3ea 100644
--- a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs
@@ -5,6 +5,10 @@ using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
namespace SixLabors.ImageSharp
{
@@ -13,6 +17,9 @@ namespace SixLabors.ImageSharp
///
internal static class Vector4Utilities
{
+ private const int BlendAlphaControl = 0b_10_00_10_00;
+ private const int ShuffleAlphaControl = 0b_11_11_11_11;
+
///
/// Restricts a vector between a minimum and a maximum value.
/// 5x Faster then .
@@ -56,13 +63,39 @@ namespace SixLabors.ImageSharp
[MethodImpl(InliningOptions.ShortMethod)]
public static void Premultiply(Span vectors)
{
- // TODO: This method can be AVX2 optimized using Vector
- ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx2.IsSupported && vectors.Length >= 2) // the wide path needs at least one full pair of Vector4 values
+ {
+ ref Vector256 vectorsBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors));
- for (int i = 0; i < vectors.Length; i++)
+ // One 256-bit vector covers two Vector4 values (8 floats), so Length / 2 whole vectors are processed in the loop below.
+ ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));
+
+ while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
+ {
+ Vector256 source = vectorsBase;
+ Vector256 multiply = Avx.Shuffle(source, source, ShuffleAlphaControl); // control 0b11_11_11_11: every slot receives element 3 (W) of its own 128-bit lane
+ vectorsBase = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl); // mask bits 3 and 7 take the untouched source value, so W is not multiplied by itself; result is stored in place
+ vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); // ref reassignment: step to the next 256-bit vector
+ }
+
+ if (ImageMaths.Modulo2(vectors.Length) != 0)
+ {
+ // Values are consumed in pairs, so at most one trailing element remains; it is handled by the single-value overload.
+ Premultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
+ }
+ }
+ else
+#endif
{
- ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
- Premultiply(ref v);
+ ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+
+ for (int i = 0; i < vectors.Length; i++) // fallback: apply the single-value overload to each element in turn
+ {
+ ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
+ Premultiply(ref v);
+ }
}
}
@@ -73,13 +106,39 @@ namespace SixLabors.ImageSharp
[MethodImpl(InliningOptions.ShortMethod)]
public static void UnPremultiply(Span vectors)
{
- // TODO: This method can be AVX2 optimized using Vector
- ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx2.IsSupported && vectors.Length >= 2) // the wide path needs at least one full pair of Vector4 values
+ {
+ ref Vector256 vectorsBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors));
- for (int i = 0; i < vectors.Length; i++)
+ // One 256-bit vector covers two Vector4 values (8 floats), so Length / 2 whole vectors are processed in the loop below.
+ ref Vector256 vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));
+
+ while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
+ {
+ Vector256 source = vectorsBase;
+ Vector256 multiply = Avx.Shuffle(source, source, ShuffleAlphaControl); // control 0b11_11_11_11: every slot receives element 3 (W) of its own 128-bit lane
+ vectorsBase = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl); // mask bits 3 and 7 take the untouched source value, so W is not divided by itself; result is stored in place
+ vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); // ref reassignment: step to the next 256-bit vector
+ }
+
+ if (ImageMaths.Modulo2(vectors.Length) != 0)
+ {
+ // Values are consumed in pairs, so at most one trailing element remains; it is handled by the single-value overload.
+ UnPremultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
+ }
+ }
+ else
+#endif
{
- ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
- UnPremultiply(ref v);
+ ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+
+ for (int i = 0; i < vectors.Length; i++) // fallback: apply the single-value overload to each element in turn
+ {
+ ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
+ UnPremultiply(ref v);
+ }
}
}
diff --git a/src/ImageSharp/Formats/Bmp/BmpEncoderCore.cs b/src/ImageSharp/Formats/Bmp/BmpEncoderCore.cs
index eb29c44050..454440f634 100644
--- a/src/ImageSharp/Formats/Bmp/BmpEncoderCore.cs
+++ b/src/ImageSharp/Formats/Bmp/BmpEncoderCore.cs
@@ -171,7 +171,7 @@ namespace SixLabors.ImageSharp.Formats.Bmp
var fileHeader = new BmpFileHeader(
type: BmpConstants.TypeMarkers.Bitmap,
- fileSize: BmpFileHeader.Size + infoHeaderSize + infoHeader.ImageSize,
+ fileSize: BmpFileHeader.Size + infoHeaderSize + colorPaletteSize + infoHeader.ImageSize,
reserved: 0,
offset: BmpFileHeader.Size + infoHeaderSize + colorPaletteSize);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs
index f6f5903684..0efefc06b5 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs
@@ -10,90 +10,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
internal partial struct Block8x8F
{
- ///
- /// Transpose the block into the destination block.
- ///
- /// The destination block
- [MethodImpl(InliningOptions.ShortMethod)]
- public void TransposeInto(ref Block8x8F d)
- {
- d.V0L.X = V0L.X;
- d.V1L.X = V0L.Y;
- d.V2L.X = V0L.Z;
- d.V3L.X = V0L.W;
- d.V4L.X = V0R.X;
- d.V5L.X = V0R.Y;
- d.V6L.X = V0R.Z;
- d.V7L.X = V0R.W;
-
- d.V0L.Y = V1L.X;
- d.V1L.Y = V1L.Y;
- d.V2L.Y = V1L.Z;
- d.V3L.Y = V1L.W;
- d.V4L.Y = V1R.X;
- d.V5L.Y = V1R.Y;
- d.V6L.Y = V1R.Z;
- d.V7L.Y = V1R.W;
-
- d.V0L.Z = V2L.X;
- d.V1L.Z = V2L.Y;
- d.V2L.Z = V2L.Z;
- d.V3L.Z = V2L.W;
- d.V4L.Z = V2R.X;
- d.V5L.Z = V2R.Y;
- d.V6L.Z = V2R.Z;
- d.V7L.Z = V2R.W;
-
- d.V0L.W = V3L.X;
- d.V1L.W = V3L.Y;
- d.V2L.W = V3L.Z;
- d.V3L.W = V3L.W;
- d.V4L.W = V3R.X;
- d.V5L.W = V3R.Y;
- d.V6L.W = V3R.Z;
- d.V7L.W = V3R.W;
-
- d.V0R.X = V4L.X;
- d.V1R.X = V4L.Y;
- d.V2R.X = V4L.Z;
- d.V3R.X = V4L.W;
- d.V4R.X = V4R.X;
- d.V5R.X = V4R.Y;
- d.V6R.X = V4R.Z;
- d.V7R.X = V4R.W;
-
- d.V0R.Y = V5L.X;
- d.V1R.Y = V5L.Y;
- d.V2R.Y = V5L.Z;
- d.V3R.Y = V5L.W;
- d.V4R.Y = V5R.X;
- d.V5R.Y = V5R.Y;
- d.V6R.Y = V5R.Z;
- d.V7R.Y = V5R.W;
-
- d.V0R.Z = V6L.X;
- d.V1R.Z = V6L.Y;
- d.V2R.Z = V6L.Z;
- d.V3R.Z = V6L.W;
- d.V4R.Z = V6R.X;
- d.V5R.Z = V6R.Y;
- d.V6R.Z = V6R.Z;
- d.V7R.Z = V6R.W;
-
- d.V0R.W = V7L.X;
- d.V1R.W = V7L.Y;
- d.V2R.W = V7L.Z;
- d.V3R.W = V7L.W;
- d.V4R.W = V7R.X;
- d.V5R.W = V7R.Y;
- d.V6R.W = V7R.Z;
- d.V7R.W = V7R.W;
- }
-
///
/// Level shift by +maximum/2, clip to [0, maximum]
///
- public void NormalizeColorsInplace(float maximum)
+ public void NormalizeColorsInPlace(float maximum)
{
var CMin4 = new Vector4(0F);
var CMax4 = new Vector4(maximum);
@@ -118,10 +38,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
}
///
- /// AVX2-only variant for executing and in one step.
+ /// AVX2-only variant for executing <see cref="NormalizeColorsInPlace"/> and rounding in one step.
///
[MethodImpl(InliningOptions.ShortMethod)]
- public void NormalizeColorsAndRoundInplaceVector8(float maximum)
+ public void NormalizeColorsAndRoundInPlaceVector8(float maximum)
{
var off = new Vector(MathF.Ceiling(maximum / 2));
var max = new Vector(maximum);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt
index 6ee0540213..e5a62dc075 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt
@@ -23,42 +23,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
internal partial struct Block8x8F
{
- ///
- /// Transpose the block into the destination block.
- ///
- /// The destination block
- [MethodImpl(InliningOptions.ShortMethod)]
- public void TransposeInto(ref Block8x8F d)
- {
- <#
- PushIndent(" ");
-
- for (int i = 0; i < 8; i++)
- {
- char destCoord = coordz[i % 4];
- char destSide = (i / 4) % 2 == 0 ? 'L' : 'R';
-
- for (int j = 0; j < 8; j++)
- {
- if(i > 0 && j == 0){
- WriteLine("");
- }
-
- char srcCoord = coordz[j % 4];
- char srcSide = (j / 4) % 2 == 0 ? 'L' : 'R';
-
- var expression = $"d.V{j}{destSide}.{destCoord} = V{i}{srcSide}.{srcCoord};\r\n";
- Write(expression);
- }
- }
- PopIndent();
- #>
- }
-
///
/// Level shift by +maximum/2, clip to [0, maximum]
///
- public void NormalizeColorsInplace(float maximum)
+ public void NormalizeColorsInPlace(float maximum)
{
var CMin4 = new Vector4(0F);
var CMax4 = new Vector4(maximum);
@@ -81,10 +49,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
}
///
- /// AVX2-only variant for executing and in one step.
+ /// AVX2-only variant for executing <see cref="NormalizeColorsInPlace"/> and rounding in one step.
///
[MethodImpl(InliningOptions.ShortMethod)]
- public void NormalizeColorsAndRoundInplaceVector8(float maximum)
+ public void NormalizeColorsAndRoundInPlaceVector8(float maximum)
{
var off = new Vector(MathF.Ceiling(maximum / 2));
var max = new Vector(maximum);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index b7835d6706..0dbdadbeb4 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -6,6 +6,10 @@ using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
using System.Text;
// ReSharper disable InconsistentNaming
@@ -277,73 +281,156 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
///
/// The value to multiply by.
[MethodImpl(InliningOptions.ShortMethod)]
- public void MultiplyInplace(float value)
- {
- this.V0L *= value;
- this.V0R *= value;
- this.V1L *= value;
- this.V1R *= value;
- this.V2L *= value;
- this.V2R *= value;
- this.V3L *= value;
- this.V3R *= value;
- this.V4L *= value;
- this.V4R *= value;
- this.V5L *= value;
- this.V5R *= value;
- this.V6L *= value;
- this.V6R *= value;
- this.V7L *= value;
- this.V7R *= value;
+ public void MultiplyInPlace(float value)
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx.IsSupported) // 256-bit path: one multiply per VnL-anchored view instead of two Vector4 multiplies
+ {
+ var valueVec = Vector256.Create(value); // broadcast the scalar into every element of a 256-bit vector
+ Unsafe.As>(ref this.V0L) = Avx.Multiply(Unsafe.As>(ref this.V0L), valueVec); // NOTE(review): the reinterpreted view starting at V0L presumably also covers V0R (one 8-float row) - confirm against the struct's field layout
+ Unsafe.As>(ref this.V1L) = Avx.Multiply(Unsafe.As>(ref this.V1L), valueVec);
+ Unsafe.As>(ref this.V2L) = Avx.Multiply(Unsafe.As>(ref this.V2L), valueVec);
+ Unsafe.As>(ref this.V3L) = Avx.Multiply(Unsafe.As>(ref this.V3L), valueVec);
+ Unsafe.As>(ref this.V4L) = Avx.Multiply(Unsafe.As>(ref this.V4L), valueVec);
+ Unsafe.As>(ref this.V5L) = Avx.Multiply(Unsafe.As>(ref this.V5L), valueVec);
+ Unsafe.As>(ref this.V6L) = Avx.Multiply(Unsafe.As>(ref this.V6L), valueVec);
+ Unsafe.As>(ref this.V7L) = Avx.Multiply(Unsafe.As>(ref this.V7L), valueVec);
+ }
+ else
+#endif
+ {
+ var valueVec = new Vector4(value); // fallback: build the broadcast Vector4 once and reuse it for all 16 fields
+ this.V0L *= valueVec;
+ this.V0R *= valueVec;
+ this.V1L *= valueVec;
+ this.V1R *= valueVec;
+ this.V2L *= valueVec;
+ this.V2R *= valueVec;
+ this.V3L *= valueVec;
+ this.V3R *= valueVec;
+ this.V4L *= valueVec;
+ this.V4R *= valueVec;
+ this.V5L *= valueVec;
+ this.V5R *= valueVec;
+ this.V6L *= valueVec;
+ this.V6R *= valueVec;
+ this.V7L *= valueVec;
+ this.V7R *= valueVec;
+ }
}
///
/// Multiply all elements of the block by the corresponding elements of 'other'.
///
[MethodImpl(InliningOptions.ShortMethod)]
- public void MultiplyInplace(ref Block8x8F other)
- {
- this.V0L *= other.V0L;
- this.V0R *= other.V0R;
- this.V1L *= other.V1L;
- this.V1R *= other.V1R;
- this.V2L *= other.V2L;
- this.V2R *= other.V2R;
- this.V3L *= other.V3L;
- this.V3R *= other.V3R;
- this.V4L *= other.V4L;
- this.V4R *= other.V4R;
- this.V5L *= other.V5L;
- this.V5R *= other.V5R;
- this.V6L *= other.V6L;
- this.V6R *= other.V6R;
- this.V7L *= other.V7L;
- this.V7R *= other.V7R;
+ public unsafe void MultiplyInPlace(ref Block8x8F other)
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx.IsSupported)
+ {
+ Unsafe.As>(ref this.V0L)
+ = Avx.Multiply(
+ Unsafe.As>(ref this.V0L),
+ Unsafe.As>(ref other.V0L));
+
+ Unsafe.As>(ref this.V1L)
+ = Avx.Multiply(
+ Unsafe.As>(ref this.V1L),
+ Unsafe.As>(ref other.V1L));
+
+ Unsafe.As>(ref this.V2L)
+ = Avx.Multiply(
+ Unsafe.As>(ref this.V2L),
+ Unsafe.As>(ref other.V2L));
+
+ Unsafe.As>(ref this.V3L)
+ = Avx.Multiply(
+ Unsafe.As>(ref this.V3L),
+ Unsafe.As>(ref other.V3L));
+
+ Unsafe.As>(ref this.V4L)
+ = Avx.Multiply(
+ Unsafe.As>(ref this.V4L),
+ Unsafe.As>(ref other.V4L));
+
+ Unsafe.As>(ref this.V5L)
+ = Avx.Multiply(
+ Unsafe.As>(ref this.V5L),
+ Unsafe.As>(ref other.V5L));
+
+ Unsafe.As>(ref this.V6L)
+ = Avx.Multiply(
+ Unsafe.As>(ref this.V6L),
+ Unsafe.As>(ref other.V6L));
+
+ Unsafe.As