diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
new file mode 100644
index 0000000000..e4dc1a1d8f
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
@@ -0,0 +1,212 @@
+// Copyright (c) Six Labors and contributors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Tuples;
+
+// ReSharper disable MemberHidesStaticFromOuterClass
+namespace SixLabors.ImageSharp
+{
+ internal static partial class SimdUtils
+ {
+ /// <summary>
+ /// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*)
+ /// </summary>
+ public static class BasicIntrinsics256
+ {
+ public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture; // Captured once at type initialization from the outer SimdUtils property.
+
+            /// <summary>Runs <see cref="BulkConvertByteToNormalizedFloat"/> on the longest prefix whose length is a multiple of 8, then slices both spans down to the unconverted remainder. Does nothing when <see cref="IsAvailable"/> is false.</summary>
+            /// <param name="source">The bytes to convert; on return it holds only the elements that were not converted.</param>
+            /// <param name="dest">The destination floats, same length as <paramref name="source"/>; on return it holds only the slots that were not written.</param>
+            internal static void BulkConvertByteToNormalizedFloatReduce(
+                ref ReadOnlySpan<byte> source,
+                ref Span<float> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
+
+                if (IsAvailable)
+                {
+                    int remainder = source.Length % 8;
+                    int alignedCount = source.Length - remainder;
+
+                    if (alignedCount > 0)
+                    {
+                        BulkConvertByteToNormalizedFloat(
+                            source.Slice(0, alignedCount),
+                            dest.Slice(0, alignedCount));
+
+                        source = source.Slice(alignedCount);
+                        dest = dest.Slice(alignedCount);
+                    }
+                }
+            }
+
+            /// <summary>
+            /// Converts 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source'
+            /// into the 'dest' buffer of <see cref="byte"/>. The values are scaled up into [0-255] and rounded.
+            /// The implementation is SIMD optimized and works only with `source.Length` divisible by 8; inputs outside [0..1] are not clamped.
+            /// Based on:
+            /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
+            /// </summary>
+            /// <param name="source">The normalized floats to convert.</param>
+            /// <param name="dest">The buffer receiving one byte per source element; must be at least as long as <paramref name="source"/>.</param>
+            internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
+            {
+                GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
+
+                DebugGuard.IsTrue((source.Length % 8) == 0, nameof(source), "source.Length should be divisible by 8!");
+                DebugGuard.IsTrue(dest.Length >= source.Length, nameof(dest), "dest should be at least as long as source!");
+                if (source.Length == 0)
+                {
+                    return;
+                }
+
+                ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
+                ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
+                int n = source.Length / 8;
+
+                Vector<float> magick = new Vector<float>(32768.0f);
+                Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
+
+                // Need to copy to a temporary struct, because
+                // Octet.OfUInt32 temp = Unsafe.As<Vector<float>, Octet.OfUInt32>(ref x)
+                // does not work. TODO: This might be a CoreClr bug, need to ask/report
+                var temp = default(Octet.OfUInt32);
+                ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
+
+                for (int i = 0; i < n; i++)
+                {
+                    // C equivalent: union { float f; uint32_t i; } u; u.f = 32768.0f + x * (255.0f / 256.0f); return (uint8_t)u.i;
+                    // Floats in [32768, 65536) are spaced 1/256 apart, so after the addition the lowest
+                    // mantissa byte holds x * 255 rounded to an integer; LoadFrom keeps exactly that byte.
+                    Vector<float> x = Unsafe.Add(ref srcBase, i);
+                    x = (x * scale) + magick;
+                    tempRef = x;
+
+                    ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
+                    d.LoadFrom(ref temp);
+                }
+            }
+
+            /// <summary>
+            /// SIMD optimized implementation for <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>.
+            /// Works only with `dest.Length` divisible by 8; 'source' must hold at least `dest.Length` bytes.
+            /// Implementation adapted from:
+            /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
+            /// http://stackoverflow.com/a/5362789
+            /// </summary>
+            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+            {
+                GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
+
+                DebugGuard.IsTrue((dest.Length % 8) == 0, nameof(dest), "dest.Length should be divisible by 8!");
+                DebugGuard.IsTrue(source.Length >= dest.Length, nameof(source), "source should be at least as long as dest!");
+                var bVec = new Vector<float>(256.0f / 255.0f);
+                var magicFloat = new Vector<float>(32768.0f);
+                var magicInt = new Vector<uint>(1191182336); // reinterpreted value of 32768.0f
+                var mask = new Vector<uint>(255);
+
+                ref Octet.OfByte sourceBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(source));
+                ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As<float, Octet.OfUInt32>(ref MemoryMarshal.GetReference(dest));
+
+                ref Vector<float> destBaseAsFloat = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref destBaseAsWideOctet);
+
+                int n = dest.Length / 8;
+                // Pass 1 widens each byte into a 32-bit slot of dest; pass 2 turns those slots into normalized floats in place.
+                for (int i = 0; i < n; i++)
+                {
+                    ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i);
+                    ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i);
+                    d.LoadFrom(ref s);
+                }
+
+                for (int i = 0; i < n; i++)
+                {
+                    ref Vector<float> df = ref Unsafe.Add(ref destBaseAsFloat, i);
+                    // Keep the widened byte b (low 8 bits) and OR in the bits of 32768.0f: read as a float this is 32768 + b / 256.
+                    var vi = Vector.AsVectorUInt32(df);
+                    vi &= mask;
+                    vi |= magicInt;
+                    // Removing the 32768 bias leaves b / 256; scaling by 256 / 255 gives b / 255.
+                    var vf = Vector.AsVectorSingle(vi);
+                    vf = (vf - magicFloat) * bVec;
+
+                    df = vf;
+                }
+            }
+
+            /// <summary>Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on the longest prefix whose length is a multiple of 8, then slices both spans down to the unconverted remainder. Does nothing when <see cref="IsAvailable"/> is false.</summary>
+            /// <param name="source">The floats to convert; on return it holds only the elements that were not converted.</param>
+            /// <param name="dest">The destination bytes, same length as <paramref name="source"/>; on return it holds only the slots that were not written.</param>
+            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+                ref ReadOnlySpan<float> source,
+                ref Span<byte> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
+
+                if (IsAvailable)
+                {
+                    int remainder = source.Length % 8;
+                    int alignedCount = source.Length - remainder;
+
+                    if (alignedCount > 0)
+                    {
+                        BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
+
+                        source = source.Slice(alignedCount);
+                        dest = dest.Slice(alignedCount);
+                    }
+                }
+            }
+
+            /// <summary>Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values into [0..1] before conversion.</summary>
+            /// <param name="source">The floats to convert; the length must be divisible by 8.</param>
+            /// <param name="dest">The buffer receiving one byte per source element; must be at least as long as <paramref name="source"/>.</param>
+            internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
+            {
+                GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
+
+                DebugGuard.IsTrue((source.Length % 8) == 0, nameof(source), "source.Length should be divisible by 8!");
+                DebugGuard.IsTrue(dest.Length >= source.Length, nameof(dest), "dest should be at least as long as source!");
+                if (source.Length == 0)
+                {
+                    return;
+                }
+
+                ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
+                ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
+                int n = source.Length / 8;
+
+                Vector<float> magick = new Vector<float>(32768.0f);
+                Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
+
+                // Need to copy to a temporary struct, because
+                // Octet.OfUInt32 temp = Unsafe.As<Vector<float>, Octet.OfUInt32>(ref x)
+                // does not work. TODO: This might be a CoreClr bug, need to ask/report
+                var temp = default(Octet.OfUInt32);
+                ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
+
+                for (int i = 0; i < n; i++)
+                {
+                    // C equivalent: union { float f; uint32_t i; } u; u.f = 32768.0f + x * (255.0f / 256.0f); return (uint8_t)u.i;
+                    // Floats in [32768, 65536) are spaced 1/256 apart, so after the addition the lowest
+                    // mantissa byte holds x * 255 rounded to an integer; LoadFrom keeps exactly that byte.
+                    Vector<float> x = Unsafe.Add(ref srcBase, i);
+                    x = Vector.Max(x, Vector<float>.Zero);
+                    x = Vector.Min(x, Vector<float>.One);
+
+                    x = (x * scale) + magick;
+                    tempRef = x;
+
+                    ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
+                    d.LoadFrom(ref temp);
+                }
+            }
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
index ec91e50988..5c0b8ee93a 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
@@ -1,8 +1,10 @@
using System;
+using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
+// ReSharper disable MemberHidesStaticFromOuterClass
namespace SixLabors.ImageSharp
{
internal static partial class SimdUtils
@@ -18,22 +20,47 @@ namespace SixLabors.ImageSharp
{
public static bool IsAvailable { get; } =
#if NETCOREAPP2_1
-// TODO: Also available in .NET 4.7.2, we need to add a build target!
- true;
+ // TODO: Also available in .NET 4.7.2, we need to add a build target!
+ Vector.IsHardwareAccelerated;
#else
false;
#endif
///
- /// A variant of , which is faster on new .NET runtime.
+ /// Runs <see cref="BulkConvertByteToNormalizedFloat"/> on as many elements as possible, slicing the spans down (keeping the remainder).
+ ///
+            [Conditional("NETCOREAPP2_1")]
+            internal static void BulkConvertByteToNormalizedFloatReduce(
+                ref ReadOnlySpan<byte> source,
+                ref Span<float> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
+                // The vectorized converter only accepts whole Vector<byte> groups; the tail stays in the spans for the caller.
+                if (IsAvailable)
+                {
+                    int remainder = source.Length % Vector<byte>.Count;
+                    int alignedCount = source.Length - remainder;
+
+                    if (alignedCount > 0)
+                    {
+                        BulkConvertByteToNormalizedFloat(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
+
+                        source = source.Slice(alignedCount);
+                        dest = dest.Slice(alignedCount);
+                    }
+                }
+            }
+
+ ///
+ /// A variant of <see cref="BasicIntrinsics256.BulkConvertByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
///
// ReSharper disable once MemberHidesStaticFromOuterClass
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest)
{
- Guard.IsTrue(
+ DebugGuard.IsTrue(
dest.Length % Vector.Count == 0,
nameof(source),
- "dest.Length should be divisable by Vector.Count!");
+ "dest.Length should be divisible by Vector.Count!");
int n = dest.Length / Vector.Count;
@@ -63,34 +90,52 @@ namespace SixLabors.ImageSharp
}
}
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static Vector ConvertToSingle(Vector u, Vector scale)
+ /// <summary>
+ /// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on as many elements as possible, slicing the spans down (keeping the remainder).
+ /// </summary>
+ [Conditional("NETCOREAPP2_1")]
+ internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+ ref ReadOnlySpan source,
+ ref Span dest)
{
- Vector vi = Vector.AsVectorInt32(u);
- Vector v = Vector.ConvertToSingle(vi);
- v *= scale;
- return v;
+ DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
+
+ if (IsAvailable)
+ {
+ int remainder = source.Length % Vector.Count;
+ int alignedCount = source.Length - remainder;
+
+ if (alignedCount > 0)
+ {
+ BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
+
+ source = source.Slice(alignedCount);
+ dest = dest.Slice(alignedCount);
+ }
+ }
}
///
- /// A variant of , which is faster on new .NET runtime.
+ /// A variant of <see cref="BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
///
///
/// It does NOT worth yet to utilize this method (2018 Oct).
/// See benchmark results for the "PackFromVector4_Rgba32" benchmark!
/// TODO: Check again later!
///
- // ReSharper disable once MemberHidesStaticFromOuterClass
- internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest)
+ internal static void BulkConvertNormalizedFloatToByteClampOverflows(
+ ReadOnlySpan source,
+ Span dest)
{
- Guard.IsTrue(
+ DebugGuard.IsTrue(
dest.Length % Vector.Count == 0,
nameof(dest),
- "dest.Length should be divisable by Vector.Count!");
+ "dest.Length should be divisible by Vector.Count!");
int n = dest.Length / Vector.Count;
- ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ ref Vector sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
for (int i = 0; i < n; i++)
@@ -126,6 +171,15 @@ namespace SixLabors.ImageSharp
Vector vi = Vector.ConvertToInt32(vf);
return Vector.AsVectorUInt32(vi);
}
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static Vector<float> ConvertToSingle(Vector<uint> u, Vector<float> scale)
+            {
+                // Reinterprets the bits as Int32 before converting; presumably callers only pass values that fit in the non-negative Int32 range - TODO confirm.
+                Vector<int> vi = Vector.AsVectorInt32(u);
+                Vector<float> v = Vector.ConvertToSingle(vi);
+                return v * scale;
+            }
}
}
-}
+}
\ No newline at end of file
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs
index 91aed8c79a..73e9bacfa8 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs
@@ -6,6 +6,9 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.PixelFormats;
+using SixLabors.ImageSharp.Tuples;
+
namespace SixLabors.ImageSharp
{
///
@@ -16,7 +19,8 @@ namespace SixLabors.ImageSharp
///
/// Gets a value indicating whether the code is being executed on AVX2 CPU where both float and integer registers are of size 256 byte.
///
- public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8;
+ public static bool IsAvx2CompatibleArchitecture { get; } =
+ Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8;
internal static void GuardAvx2(string operation)
{
@@ -57,236 +61,61 @@ namespace SixLabors.ImageSharp
}
///
- /// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of values.
- /// The values are scaled up into [0-255] and rounded.
- /// The implementation is SIMD optimized and works only with `source.Length` divisible by .
- /// Based on:
- ///
- /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
- ///
- ///
- internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan source, Span dest)
- {
- GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
-
- DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!");
-
- if (source.Length == 0)
- {
- return;
- }
-
- ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
- ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
- int n = source.Length / 8;
-
- Vector magick = new Vector(32768.0f);
- Vector scale = new Vector(255f) / new Vector(256f);
-
- // need to copy to a temporary struct, because
- // SimdUtils.Octet.OfUInt32 temp = Unsafe.As, SimdUtils.Octet.OfUInt32>(ref x)
- // does not work. TODO: This might be a CoreClr bug, need to ask/report
- var temp = default(Octet.OfUInt32);
- ref Vector tempRef = ref Unsafe.As>(ref temp);
-
- for (int i = 0; i < n; i++)
- {
- // union { float f; uint32_t i; } u;
- // u.f = 32768.0f + x * (255.0f / 256.0f);
- // return (uint8_t)u.i;
- Vector x = Unsafe.Add(ref srcBase, i);
- x = (x * scale) + magick;
- tempRef = x;
-
- ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
- d.LoadFrom(ref temp);
- }
- }
-
- ///
- /// Converts `dest.Length` bytes to -s to -s normalized into [0..1]
- /// The implementation is SIMD optimized and works only with `dest.Length` divisible by .
- /// Implementation adapted from:
- ///
- /// http://stackoverflow.com/a/5362789
- ///
+ /// Converts `dest.Length` <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
+ /// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
+ /// but there are no restrictions on the span's length.
///
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest)
{
- GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
-
- DebugGuard.IsTrue((dest.Length % Vector.Count) == 0, nameof(source), "dest.Length should be divisable by Vector.Count!");
-
- var bVec = new Vector(256.0f / 255.0f);
- var magicFloat = new Vector(32768.0f);
- var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f
- var mask = new Vector(255);
+ DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
- ref Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
- ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
-
- ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet);
-
- int n = dest.Length / 8;
-
- for (int i = 0; i < n; i++)
- {
- ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i);
- ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i);
- d.LoadFrom(ref s);
- }
+ ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
+ BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
- for (int i = 0; i < n; i++)
+ // Deal with the remainder:
+ int count = source.Length;
+ if (count > 0)
{
- ref Vector df = ref Unsafe.Add(ref destBaseAsFloat, i);
-
- var vi = Vector.AsVectorUInt32(df);
- vi &= mask;
- vi |= magicInt;
-
- var vf = Vector.AsVectorSingle(vi);
- vf = (vf - magicFloat) * bVec;
-
- df = vf;
+ // TODO: Do we need to optimize anything on this? (At most 7 elements remain when BasicIntrinsics256 is available; without SIMD support the whole input is converted here.)
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref float dBase = ref MemoryMarshal.GetReference(dest);
+ for (int i = 0; i < count; i++)
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f;
+ }
}
}
///
- /// Same as but clamps overflown values before conversion.
+ /// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
+ /// The values are scaled up into [0-255] and rounded, overflows are clamped.
+ /// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
+ /// but there are no restrictions on the span's length.
///
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest)
{
- GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
-
- DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!");
-
- if (source.Length == 0)
- {
- return;
- }
-
- ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
- ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
- int n = source.Length / 8;
-
- Vector magick = new Vector(32768.0f);
- Vector scale = new Vector(255f) / new Vector(256f);
-
- // need to copy to a temporary struct, because
- // SimdUtils.Octet.OfUInt32 temp = Unsafe.As, SimdUtils.Octet.OfUInt32>(ref x)
- // does not work. TODO: This might be a CoreClr bug, need to ask/report
- var temp = default(Octet.OfUInt32);
- ref Vector tempRef = ref Unsafe.As>(ref temp);
-
- for (int i = 0; i < n; i++)
- {
- // union { float f; uint32_t i; } u;
- // u.f = 32768.0f + x * (255.0f / 256.0f);
- // return (uint8_t)u.i;
- Vector x = Unsafe.Add(ref srcBase, i);
- x = Vector.Max(x, Vector.Zero);
- x = Vector.Min(x, Vector.One);
-
- x = (x * scale) + magick;
- tempRef = x;
-
- ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
- d.LoadFrom(ref temp);
- }
- }
-
- // TODO: Replace these with T4-d library level tuples!
- internal static class Octet
- {
- [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
- public struct OfUInt32
- {
- [FieldOffset(0 * sizeof(uint))]
- public uint V0;
-
- [FieldOffset(1 * sizeof(uint))]
- public uint V1;
-
- [FieldOffset(2 * sizeof(uint))]
- public uint V2;
-
- [FieldOffset(3 * sizeof(uint))]
- public uint V3;
-
- [FieldOffset(4 * sizeof(uint))]
- public uint V4;
-
- [FieldOffset(5 * sizeof(uint))]
- public uint V5;
+ DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
- [FieldOffset(6 * sizeof(uint))]
- public uint V6;
+ ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
+ BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
- [FieldOffset(7 * sizeof(uint))]
- public uint V7;
-
- public override string ToString()
- {
- return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
- }
-
- [MethodImpl(InliningOptions.ShortMethod)]
- public void LoadFrom(ref OfByte src)
- {
- this.V0 = src.V0;
- this.V1 = src.V1;
- this.V2 = src.V2;
- this.V3 = src.V3;
- this.V4 = src.V4;
- this.V5 = src.V5;
- this.V6 = src.V6;
- this.V7 = src.V7;
- }
- }
-
- [StructLayout(LayoutKind.Explicit, Size = 8)]
- public struct OfByte
+ // Deal with the remainder:
+ int count = source.Length;
+ if (count > 0)
{
- [FieldOffset(0)]
- public byte V0;
-
- [FieldOffset(1)]
- public byte V1;
-
- [FieldOffset(2)]
- public byte V2;
-
- [FieldOffset(3)]
- public byte V3;
-
- [FieldOffset(4)]
- public byte V4;
-
- [FieldOffset(5)]
- public byte V5;
-
- [FieldOffset(6)]
- public byte V6;
-
- [FieldOffset(7)]
- public byte V7;
-
- public override string ToString()
- {
- return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
- }
+ ref float sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
- [MethodImpl(InliningOptions.ShortMethod)]
- public void LoadFrom(ref OfUInt32 src)
+ for (int i = 0; i < count; i++)
{
- this.V0 = (byte)src.V0;
- this.V1 = (byte)src.V1;
- this.V2 = (byte)src.V2;
- this.V3 = (byte)src.V3;
- this.V4 = (byte)src.V4;
- this.V5 = (byte)src.V5;
- this.V6 = (byte)src.V6;
- this.V7 = (byte)src.V7;
+ // TODO: Do we need to optimize anything on this? (At most 7 elements remain when BasicIntrinsics256 is available; without SIMD support the whole input is converted here.)
+ float f = Unsafe.Add(ref sBase, i);
+ f *= 255f;
+ f += 0.5f;
+ f = MathF.Max(0, f);
+ f = MathF.Min(255f, f);
+
+ Unsafe.Add(ref dBase, i) = (byte)f;
}
}
}
diff --git a/src/ImageSharp/Common/Tuples/Octet.cs b/src/ImageSharp/Common/Tuples/Octet.cs
new file mode 100644
index 0000000000..ae01a31217
--- /dev/null
+++ b/src/ImageSharp/Common/Tuples/Octet.cs
@@ -0,0 +1,120 @@
+// Copyright (c) Six Labors and contributors.
+// Licensed under the Apache License, Version 2.0.
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp.Tuples
+{
+    /// <summary>
+    /// Contains 8-element value tuples of various element types.
+    /// </summary>
+    internal static class Octet
+    {
+        /// <summary>
+        /// Eight <see cref="uint"/> fields at consecutive 4-byte offsets (32 bytes in total).
+        /// </summary>
+        [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
+        public struct OfUInt32
+        {
+            [FieldOffset(0 * sizeof(uint))]
+            public uint V0;
+
+            [FieldOffset(1 * sizeof(uint))]
+            public uint V1;
+
+            [FieldOffset(2 * sizeof(uint))]
+            public uint V2;
+
+            [FieldOffset(3 * sizeof(uint))]
+            public uint V3;
+
+            [FieldOffset(4 * sizeof(uint))]
+            public uint V4;
+
+            [FieldOffset(5 * sizeof(uint))]
+            public uint V5;
+
+            [FieldOffset(6 * sizeof(uint))]
+            public uint V6;
+
+            [FieldOffset(7 * sizeof(uint))]
+            public uint V7;
+
+            /// <inheritdoc/>
+            public override string ToString()
+            {
+                return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
+            }
+
+            /// <summary>
+            /// Sets every field to the matching field of <paramref name="src"/>, widened to <see cref="uint"/>.
+            /// </summary>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public void LoadFrom(ref OfByte src)
+            {
+                this.V0 = src.V0;
+                this.V1 = src.V1;
+                this.V2 = src.V2;
+                this.V3 = src.V3;
+                this.V4 = src.V4;
+                this.V5 = src.V5;
+                this.V6 = src.V6;
+                this.V7 = src.V7;
+            }
+        }
+
+        /// <summary>
+        /// Eight <see cref="byte"/> fields at consecutive offsets (8 bytes in total).
+        /// </summary>
+        [StructLayout(LayoutKind.Explicit, Size = 8)]
+        public struct OfByte
+        {
+            [FieldOffset(0)]
+            public byte V0;
+
+            [FieldOffset(1)]
+            public byte V1;
+
+            [FieldOffset(2)]
+            public byte V2;
+
+            [FieldOffset(3)]
+            public byte V3;
+
+            [FieldOffset(4)]
+            public byte V4;
+
+            [FieldOffset(5)]
+            public byte V5;
+
+            [FieldOffset(6)]
+            public byte V6;
+
+            [FieldOffset(7)]
+            public byte V7;
+
+            /// <inheritdoc/>
+            public override string ToString()
+            {
+                return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
+            }
+
+            /// <summary>
+            /// Sets every field to the matching field of <paramref name="src"/> cast to <see cref="byte"/> (only the low 8 bits survive, assuming the default unchecked context).
+            /// </summary>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public void LoadFrom(ref OfUInt32 src)
+            {
+                this.V0 = (byte)src.V0;
+                this.V1 = (byte)src.V1;
+                this.V2 = (byte)src.V2;
+                this.V3 = (byte)src.V3;
+                this.V4 = (byte)src.V4;
+                this.V5 = (byte)src.V5;
+                this.V6 = (byte)src.V6;
+                this.V7 = (byte)src.V7;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/ImageSharp/Common/Tuples/Vector4Pair.cs b/src/ImageSharp/Common/Tuples/Vector4Pair.cs
index 309d5e2e56..5988b2200b 100644
--- a/src/ImageSharp/Common/Tuples/Vector4Pair.cs
+++ b/src/ImageSharp/Common/Tuples/Vector4Pair.cs
@@ -2,7 +2,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
-namespace SixLabors.ImageSharp.Common.Tuples
+namespace SixLabors.ImageSharp.Tuples
{
///
/// Its faster to process multiple Vector4-s together, so let's pair them!
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
index 4b2626c582..5c63a478db 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
@@ -6,7 +6,7 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
-using SixLabors.ImageSharp.Common.Tuples;
+using SixLabors.ImageSharp.Tuples;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
index ab4947e65c..3f26cdc907 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
@@ -6,7 +6,7 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
-using SixLabors.ImageSharp.Common.Tuples;
+using SixLabors.ImageSharp.Tuples;
// ReSharper disable ImpureMethodCallOnReadonlyValueField
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
index 60abb7fb2c..293f3bc1f7 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
@@ -6,8 +6,8 @@ using System.Collections.Generic;
using System.Linq;
using System.Numerics;
-using SixLabors.ImageSharp.Common.Tuples;
using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.Tuples;
using SixLabors.Memory;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
index bfef60c606..564b93ef52 100644
--- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
+++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
@@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.PixelFormats
}
else
{
- ConvertToVector4UsingStandardIntrinsics(sourceColors, destinationVectors, count);
+ ConvertToVector4UsingBasicIntrinsics(sourceColors, destinationVectors, count);
}
}
@@ -58,7 +58,7 @@ namespace SixLabors.ImageSharp.PixelFormats
}
else
{
- ConvertFromVector4StandardIntrinsics(sourceVectors, destinationColors, count);
+ ConvertFromVector4BasicIntrinsics(sourceVectors, destinationColors, count);
}
}
@@ -112,7 +112,7 @@ namespace SixLabors.ImageSharp.PixelFormats
}
}
- private static void ConvertToVector4UsingStandardIntrinsics(
+ private static void ConvertToVector4UsingBasicIntrinsics(
ReadOnlySpan sourceColors,
Span destinationVectors,
int count)
@@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.PixelFormats
ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors);
Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount));
- SimdUtils.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
+ SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
}
if (remainder > 0)
@@ -155,7 +155,7 @@ namespace SixLabors.ImageSharp.PixelFormats
}
}
- private static void ConvertFromVector4StandardIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count)
+ private static void ConvertFromVector4BasicIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count)
{
int remainder = count % 2;
int alignedCount = count - remainder;
@@ -165,7 +165,7 @@ namespace SixLabors.ImageSharp.PixelFormats
ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount));
Span rawDest = MemoryMarshal.Cast(destinationColors);
- SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
+ SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
}
if (remainder > 0)
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
index 726e214a96..855e9e4b97 100644
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
@@ -30,8 +30,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
[Params(
//64,
//256,
- //512,
- 2048
+ 512
+ //1024
)]
public int Count { get; set; }
@@ -117,7 +117,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
}
- //[Benchmark]
+ [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops()
{
Span sBytes = MemoryMarshal.Cast(this.source.GetSpan());
@@ -159,7 +159,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
}
}
- //[Benchmark]
+ [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop()
{
Span sBytes = MemoryMarshal.Cast(this.source.GetSpan());
diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs
index be19e719a8..ca85a350cc 100644
--- a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs
+++ b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs
@@ -5,6 +5,7 @@ using BenchmarkDotNet.Attributes;
namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
{
+ [Config(typeof(Config.ShortClr))]
public class UInt32ToSingle
{
private float[] data;
@@ -66,8 +67,7 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
Unsafe.Add(ref bf, i) = v;
}
}
-
- // This code is not correct at all, it's just here as reference
+
[Benchmark]
public void StandardSimdFromInt()
{
@@ -86,5 +86,28 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
Unsafe.Add(ref bf, i) = v;
}
}
+
+
+        [Benchmark]
+        public void StandardSimdFromInt_RefCast()
+        {
+            int n = Count / Vector<float>.Count;
+
+            ref Vector<float> bf = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);
+            // Works in place: each vector is read through a ref, reinterpreted as Int32 lanes, converted and written back.
+
+            var scale = new Vector<float>(1f / 255f);
+
+            for (int i = 0; i < n; i++)
+            {
+                ref Vector<float> fRef = ref Unsafe.Add(ref bf, i);
+
+                Vector<int> du = Vector.AsVectorInt32(fRef);
+                Vector<float> v = Vector.ConvertToSingle(du);
+                v *= scale;
+
+                fRef = v;
+            }
+        }
}
}
\ No newline at end of file
diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs
index f71f6ec1bf..2bc3af4c98 100644
--- a/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs
+++ b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs
@@ -3,8 +3,11 @@ using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Tuples;
+
namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
{
+ [Config(typeof(Config.ShortClr))]
public class WidenBytesToUInt32
{
private byte[] source;
@@ -25,8 +28,8 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
{
const int N = Count / 8;
- ref SimdUtils.Octet.OfByte sBase = ref Unsafe.As(ref this.source[0]);
- ref SimdUtils.Octet.OfUInt32 dBase = ref Unsafe.As(ref this.dest[0]);
+ ref Octet.OfByte sBase = ref Unsafe.As(ref this.source[0]);
+ ref Octet.OfUInt32 dBase = ref Unsafe.As(ref this.dest[0]);
for (int i = 0; i < N; i++)
{
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
index 4e1717bda9..2dcba2b74b 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@@ -62,7 +62,7 @@ namespace SixLabors.ImageSharp.Tests.Common
{
float[] data = new float[Vector.Count];
- var rnd = new Random();
+ var rnd = new Random(seed);
for (int i = 0; i < Vector.Count; i++)
{
@@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common
[InlineData(1, 8)]
[InlineData(2, 16)]
[InlineData(3, 128)]
- public void BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
+ public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
{
if (this.SkipOnNonAvx2())
{
@@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Tests.Common
byte[] dest = new byte[count];
- SimdUtils.BulkConvertNormalizedFloatToByte(normalized, dest);
+ SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(normalized, dest);
byte[] expected = orig.Select(f => (byte)(f)).ToArray();
@@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common
[InlineData(1, 8)]
[InlineData(2, 16)]
[InlineData(3, 128)]
- public void BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
+ public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
{
if (this.SkipOnNonAvx2())
{
@@ -153,87 +153,113 @@ namespace SixLabors.ImageSharp.Tests.Common
byte[] dest = new byte[count];
- SimdUtils.BulkConvertNormalizedFloatToByte(source, dest);
+ SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(source, dest);
byte[] expected = source.Select(f => (byte)Math.Round(f * 255f)).ToArray();
Assert.Equal(expected, dest);
}
+ public static readonly TheoryData<int> ArraySizesDivisibleBy8 = new TheoryData<int> { 0, 8, 16, 1024 };
+
+ public static readonly TheoryData<int> ArraySizesDivisibleBy32 = new TheoryData<int> { 0, 32, 512 };
+
+ public static readonly TheoryData<int> ArbitraryArraySizes =
+ new TheoryData<int>
+ {
+ 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520,
+ };
[Theory]
- [InlineData(1, 0)]
- [InlineData(2, 32)]
- [InlineData(3, 128)]
- public void BulkConvertByteToNormalizedFloat(int seed, int count)
+ [MemberData(nameof(ArraySizesDivisibleBy8))]
+ public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count)
{
if (this.SkipOnNonAvx2())
{
return;
}
- byte[] source = new Random(seed).GenerateRandomByteArray(count);
- float[] result = new float[count];
- float[] expected = source.Select(b => (float)b / 255f).ToArray();
-
- SimdUtils.BulkConvertByteToNormalizedFloat(source, result);
-
- Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
+ TestImpl_BulkConvertByteToNormalizedFloat(
+ count,
+ (s, d) => SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(s.Span, d.Span));
}
[Theory]
- [InlineData(1, 0)]
- [InlineData(2, 32)]
- [InlineData(3, 128)]
- public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count)
+ [MemberData(nameof(ArraySizesDivisibleBy32))]
+ public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int count)
+ {
+ TestImpl_BulkConvertByteToNormalizedFloat(
+ count,
+ (s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(s.Span, d.Span));
+ }
+
+ [Theory]
+ [MemberData(nameof(ArbitraryArraySizes))]
+ public void BulkConvertByteToNormalizedFloat(int count)
+ {
+ TestImpl_BulkConvertByteToNormalizedFloat(
+ count,
+ (s, d) => SimdUtils.BulkConvertByteToNormalizedFloat(s.Span, d.Span));
+ }
+
+ private static void TestImpl_BulkConvertByteToNormalizedFloat(
+ int count,
+ Action<Memory<byte>, Memory<float>> convert)
{
- byte[] source = new Random(seed).GenerateRandomByteArray(count);
+ byte[] source = new Random(count).GenerateRandomByteArray(count);
float[] result = new float[count];
float[] expected = source.Select(b => (float)b / 255f).ToArray();
-
- SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result);
+ convert(source, result);
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
}
-
- public static readonly TheoryData<int> BulkConvertNormalizedFloatToByteClampOverflows_Data =
- new TheoryData<int>
- {
- 0, 64, 1024
- };
-
[Theory]
- [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))]
- public void BulkConvertNormalizedFloatToByteClampOverflows(int count)
+ [MemberData(nameof(ArraySizesDivisibleBy8))]
+ public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
if (this.SkipOnNonAvx2())
{
return;
}
- float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f);
- byte[] expected = source.Select(NormalizedFloatToByte).ToArray();
- byte[] actual = new byte[count];
-
- SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(source, actual);
-
- Assert.Equal(expected, actual);
+ TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count,
+ (s, d) => SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)
+ );
}
[Theory]
- [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))]
+ [MemberData(nameof(ArraySizesDivisibleBy32))]
public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
+ {
+ TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count,
+ (s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)
+ );
+ }
+
+ [Theory]
+ [MemberData(nameof(ArbitraryArraySizes))]
+ public void BulkConvertNormalizedFloatToByteClampOverflows(int count)
+ {
+ TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count,
+ (s, d) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)
+ );
+ }
+
+ private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
+ int count,
+ Action<Memory<float>, Memory<byte>> convert)
{
float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f);
byte[] expected = source.Select(NormalizedFloatToByte).ToArray();
byte[] actual = new byte[count];
- SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(source, actual);
+ convert(source, actual);
Assert.Equal(expected, actual);
}
+
private static byte NormalizedFloatToByte(float f) => (byte)Math.Min(255f, Math.Max(0f, f * 255f + 0.5f));
[Theory]