diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs new file mode 100644 index 0000000000..e4dc1a1d8f --- /dev/null +++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs @@ -0,0 +1,212 @@ +// Copyright (c) Six Labors and contributors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using SixLabors.ImageSharp.Tuples; + +// ReSharper disable MemberHidesStaticFromOuterClass +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + /// + /// 256bit / AVX2 intrinsics NOT depending on newer APIs (Vector.Widen, Vector.Narrow, Vector.ConvertTo*) + /// + public static class BasicIntrinsics256 + { + public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture; + + /// + /// Converts the largest prefix whose length is a multiple of 8 (only when IsAvailable is true), then slices both spans down to the unconverted remainder. + /// + internal static void BulkConvertByteToNormalizedFloatReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + if (IsAvailable) + { + int remainder = source.Length % 8; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertByteToNormalizedFloat( + source.Slice(0, alignedCount), + dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } + } + + /// + /// Convert 'source.Length' values normalized into [0..1] from 'source' + /// into 'dest' buffer of bytes. The values are scaled up into [0-255] and rounded. + /// The implementation is SIMD optimized and works only with `source.Length` divisible by 8. 
+ /// Based on: + /// + /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions + /// + /// + internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan source, Span dest) + { + GuardAvx2(nameof(BulkConvertNormalizedFloatToByte)); + + DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisible by Vector.Count!"); + + if (source.Length == 0) + { + return; + } + + ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + int n = source.Length / 8; + + Vector magick = new Vector(32768.0f); + Vector scale = new Vector(255f) / new Vector(256f); + + // need to copy to a temporary struct, because + // Octet.OfUInt32 temp = Unsafe.As, Octet.OfUInt32>(ref x) + // does not work. TODO: This might be a CoreClr bug, need to ask/report + var temp = default(Octet.OfUInt32); + ref Vector tempRef = ref Unsafe.As>(ref temp); + + for (int i = 0; i < n; i++) + { + // union { float f; uint32_t i; } u; + // u.f = 32768.0f + x * (255.0f / 256.0f); + // return (uint8_t)u.i; + Vector x = Unsafe.Add(ref srcBase, i); + x = (x * scale) + magick; + tempRef = x; + + ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i); + d.LoadFrom(ref temp); + } + } + + /// + /// SIMD optimized implementation for . + /// Works only with `dest.Length` divisible by 8. 
+ /// Implementation adapted from: + /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions + /// http://stackoverflow.com/a/5362789 + /// + internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) + { + GuardAvx2(nameof(BulkConvertByteToNormalizedFloat)); + + DebugGuard.IsTrue((dest.Length % 8) == 0, nameof(dest), "dest.Length should be divisible by 8!"); + + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreted value of 32768.0f + var mask = new Vector(255); + + ref Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + + ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); + + int n = dest.Length / 8; + + for (int i = 0; i < n; i++) + { + ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); + ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i); + d.LoadFrom(ref s); + } + + for (int i = 0; i < n; i++) + { + ref Vector df = ref Unsafe.Add(ref destBaseAsFloat, i); + + var vi = Vector.AsVectorUInt32(df); + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; + + df = vf; + } + } + + /// + /// Converts the largest prefix whose length is a multiple of Vector.Count (only when IsAvailable is true), then slices both spans down to the unconverted remainder. 
+ /// + internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + if (IsAvailable) + { + int remainder = source.Length % Vector.Count; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } + } + + /// + /// Same as BulkConvertNormalizedFloatToByte, but clamps out-of-range values into [0..1] before conversion. + /// + internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) + { + GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows)); + + DebugGuard.IsTrue((source.Length % 8) == 0, nameof(source), "source.Length should be divisible by 8!"); + + if (source.Length == 0) + { + return; + } + + ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + int n = source.Length / 8; + + Vector magick = new Vector(32768.0f); + Vector scale = new Vector(255f) / new Vector(256f); + + // need to copy to a temporary struct, because + // Octet.OfUInt32 temp = Unsafe.As, Octet.OfUInt32>(ref x) + // does not work. 
TODO: This might be a CoreClr bug, need to ask/report + var temp = default(Octet.OfUInt32); + ref Vector tempRef = ref Unsafe.As>(ref temp); + + for (int i = 0; i < n; i++) + { + // union { float f; uint32_t i; } u; + // u.f = 32768.0f + x * (255.0f / 256.0f); + // return (uint8_t)u.i; + Vector x = Unsafe.Add(ref srcBase, i); + x = Vector.Max(x, Vector.Zero); + x = Vector.Min(x, Vector.One); + + x = (x * scale) + magick; + tempRef = x; + + ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i); + d.LoadFrom(ref temp); + } + } + } + } +} \ No newline at end of file diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index ec91e50988..5c0b8ee93a 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -1,8 +1,10 @@ using System; +using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +// ReSharper disable MemberHidesStaticFromOuterClass namespace SixLabors.ImageSharp { internal static partial class SimdUtils @@ -18,22 +20,47 @@ namespace SixLabors.ImageSharp { public static bool IsAvailable { get; } = #if NETCOREAPP2_1 -// TODO: Also available in .NET 4.7.2, we need to add a build target! - true; + // TODO: Also available in .NET 4.7.2, we need to add a build target! + Vector.IsHardwareAccelerated; #else false; #endif /// - /// A variant of , which is faster on new .NET runtime. + /// as much elements as possible, slicing them down (keeping the remainder). 
+ /// + [Conditional("NETCOREAPP2_1")] + internal static void BulkConvertByteToNormalizedFloatReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + if (IsAvailable) + { + int remainder = source.Length % Vector.Count; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertByteToNormalizedFloat(source.Slice(0, alignedCount), dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } + } + + /// + /// A variant of , which is faster on new RyuJIT runtime. /// // ReSharper disable once MemberHidesStaticFromOuterClass internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - Guard.IsTrue( + DebugGuard.IsTrue( dest.Length % Vector.Count == 0, nameof(source), - "dest.Length should be divisable by Vector.Count!"); + "dest.Length should be divisible by Vector.Count!"); int n = dest.Length / Vector.Count; @@ -63,34 +90,52 @@ namespace SixLabors.ImageSharp } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector ConvertToSingle(Vector u, Vector scale) + /// + /// as much elements as possible, slicing them down (keeping the remainder). 
+ /// + [Conditional("NETCOREAPP2_1")] + internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( + ref ReadOnlySpan source, + ref Span dest) { - Vector vi = Vector.AsVectorInt32(u); - Vector v = Vector.ConvertToSingle(vi); - v *= scale; - return v; + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + if (IsAvailable) + { + int remainder = source.Length % Vector.Count; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } } /// - /// A variant of , which is faster on new .NET runtime. + /// A variant of , which is faster on new .NET runtime. /// /// /// It does NOT worth yet to utilize this method (2018 Oct). /// See benchmark results for the "PackFromVector4_Rgba32" benchmark! /// TODO: Check again later! 
/// - // ReSharper disable once MemberHidesStaticFromOuterClass - internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) + internal static void BulkConvertNormalizedFloatToByteClampOverflows( + ReadOnlySpan source, + Span dest) { - Guard.IsTrue( + DebugGuard.IsTrue( dest.Length % Vector.Count == 0, nameof(dest), - "dest.Length should be divisable by Vector.Count!"); + "dest.Length should be divisible by Vector.Count!"); int n = dest.Length / Vector.Count; - ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Vector sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); for (int i = 0; i < n; i++) @@ -126,6 +171,15 @@ namespace SixLabors.ImageSharp Vector vi = Vector.ConvertToInt32(vf); return Vector.AsVectorUInt32(vi); } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector ConvertToSingle(Vector u, Vector scale) + { + Vector vi = Vector.AsVectorInt32(u); + Vector v = Vector.ConvertToSingle(vi); + v *= scale; + return v; + } } } -} +} \ No newline at end of file diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 91aed8c79a..73e9bacfa8 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -6,6 +6,9 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Tuples; + namespace SixLabors.ImageSharp { /// @@ -16,7 +19,8 @@ namespace SixLabors.ImageSharp /// /// Gets a value indicating whether the code is being executed on AVX2 CPU where both float and integer registers are of size 256 byte. 
/// - public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8; + public static bool IsAvx2CompatibleArchitecture { get; } = + Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8; internal static void GuardAvx2(string operation) { @@ -57,236 +61,61 @@ namespace SixLabors.ImageSharp } /// - /// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of values. - /// The values are scaled up into [0-255] and rounded. - /// The implementation is SIMD optimized and works only with `source.Length` divisible by . - /// Based on: - /// - /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions - /// - /// - internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan source, Span dest) - { - GuardAvx2(nameof(BulkConvertNormalizedFloatToByte)); - - DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!"); - - if (source.Length == 0) - { - return; - } - - ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); - ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); - int n = source.Length / 8; - - Vector magick = new Vector(32768.0f); - Vector scale = new Vector(255f) / new Vector(256f); - - // need to copy to a temporary struct, because - // SimdUtils.Octet.OfUInt32 temp = Unsafe.As, SimdUtils.Octet.OfUInt32>(ref x) - // does not work. 
TODO: This might be a CoreClr bug, need to ask/report - var temp = default(Octet.OfUInt32); - ref Vector tempRef = ref Unsafe.As>(ref temp); - - for (int i = 0; i < n; i++) - { - // union { float f; uint32_t i; } u; - // u.f = 32768.0f + x * (255.0f / 256.0f); - // return (uint8_t)u.i; - Vector x = Unsafe.Add(ref srcBase, i); - x = (x * scale) + magick; - tempRef = x; - - ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i); - d.LoadFrom(ref temp); - } - } - - /// - /// Converts `dest.Length` bytes to -s to -s normalized into [0..1] - /// The implementation is SIMD optimized and works only with `dest.Length` divisible by . - /// Implementation adapted from: - /// - /// http://stackoverflow.com/a/5362789 - /// + /// Converts `dest.Length` -s to -s normalized into [0..1]. + /// should be the of the same size as , + /// but there are no restrictions on the span's length. /// internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - GuardAvx2(nameof(BulkConvertByteToNormalizedFloat)); - - DebugGuard.IsTrue((dest.Length % Vector.Count) == 0, nameof(source), "dest.Length should be divisable by Vector.Count!"); - - var bVec = new Vector(256.0f / 255.0f); - var magicFloat = new Vector(32768.0f); - var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f - var mask = new Vector(255); + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); - ref Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); - ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); - - ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); - - int n = dest.Length / 8; - - for (int i = 0; i < n; i++) - { - ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); - ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i); - d.LoadFrom(ref s); - } + ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref 
dest); + BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); - for (int i = 0; i < n; i++) + // Deal with the remainder: + int count = source.Length; + if (count > 0) { - ref Vector df = ref Unsafe.Add(ref destBaseAsFloat, i); - - var vi = Vector.AsVectorUInt32(df); - vi &= mask; - vi |= magicInt; - - var vf = Vector.AsVectorSingle(vi); - vf = (vf - magicFloat) * bVec; - - df = vf; + // TODO: Do we need to optimize anything on this? (At most 7 elements remain when BasicIntrinsics256 is available; otherwise this loop may process up to the whole span) + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref float dBase = ref MemoryMarshal.GetReference(dest); + for (int i = 0; i < count; i++) + { + Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f; + } } } /// - /// Same as but clamps overflown values before conversion. + /// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of bytes. + /// The values are scaled up into [0-255] and rounded, overflows are clamped. + /// 'dest' should be of the same size as 'source', + /// but there are no restrictions on the span's length. /// internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) { - GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows)); - - DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!"); - - if (source.Length == 0) - { - return; - } - - ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); - ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); - int n = source.Length / 8; - - Vector magick = new Vector(32768.0f); - Vector scale = new Vector(255f) / new Vector(256f); - - // need to copy to a temporary struct, because - // SimdUtils.Octet.OfUInt32 temp = Unsafe.As, SimdUtils.Octet.OfUInt32>(ref x) - // does not work. 
TODO: This might be a CoreClr bug, need to ask/report - var temp = default(Octet.OfUInt32); - ref Vector tempRef = ref Unsafe.As>(ref temp); - - for (int i = 0; i < n; i++) - { - // union { float f; uint32_t i; } u; - // u.f = 32768.0f + x * (255.0f / 256.0f); - // return (uint8_t)u.i; - Vector x = Unsafe.Add(ref srcBase, i); - x = Vector.Max(x, Vector.Zero); - x = Vector.Min(x, Vector.One); - - x = (x * scale) + magick; - tempRef = x; - - ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i); - d.LoadFrom(ref temp); - } - } - - // TODO: Replace these with T4-d library level tuples! - internal static class Octet - { - [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))] - public struct OfUInt32 - { - [FieldOffset(0 * sizeof(uint))] - public uint V0; - - [FieldOffset(1 * sizeof(uint))] - public uint V1; - - [FieldOffset(2 * sizeof(uint))] - public uint V2; - - [FieldOffset(3 * sizeof(uint))] - public uint V3; - - [FieldOffset(4 * sizeof(uint))] - public uint V4; - - [FieldOffset(5 * sizeof(uint))] - public uint V5; + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); - [FieldOffset(6 * sizeof(uint))] - public uint V6; + ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); + BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); - [FieldOffset(7 * sizeof(uint))] - public uint V7; - - public override string ToString() - { - return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; - } - - [MethodImpl(InliningOptions.ShortMethod)] - public void LoadFrom(ref OfByte src) - { - this.V0 = src.V0; - this.V1 = src.V1; - this.V2 = src.V2; - this.V3 = src.V3; - this.V4 = src.V4; - this.V5 = src.V5; - this.V6 = src.V6; - this.V7 = src.V7; - } - } - - [StructLayout(LayoutKind.Explicit, Size = 8)] - public struct OfByte + // Deal with the remainder: + int count = source.Length; + if (count > 0) { - 
[FieldOffset(0)] - public byte V0; - - [FieldOffset(1)] - public byte V1; - - [FieldOffset(2)] - public byte V2; - - [FieldOffset(3)] - public byte V3; - - [FieldOffset(4)] - public byte V4; - - [FieldOffset(5)] - public byte V5; - - [FieldOffset(6)] - public byte V6; - - [FieldOffset(7)] - public byte V7; - - public override string ToString() - { - return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; - } + ref float sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); - [MethodImpl(InliningOptions.ShortMethod)] - public void LoadFrom(ref OfUInt32 src) + for (int i = 0; i < count; i++) { - this.V0 = (byte)src.V0; - this.V1 = (byte)src.V1; - this.V2 = (byte)src.V2; - this.V3 = (byte)src.V3; - this.V4 = (byte)src.V4; - this.V5 = (byte)src.V5; - this.V6 = (byte)src.V6; - this.V7 = (byte)src.V7; + // TODO: Do we need to optimize anything on this? (At most 7 elements remain when BasicIntrinsics256 is available; otherwise this loop may process up to the whole span) + float f = Unsafe.Add(ref sBase, i); + f *= 255f; + f += 0.5f; + f = MathF.Max(0, f); + f = MathF.Min(255f, f); + + Unsafe.Add(ref dBase, i) = (byte)f; } } } diff --git a/src/ImageSharp/Common/Tuples/Octet.cs b/src/ImageSharp/Common/Tuples/Octet.cs new file mode 100644 index 0000000000..ae01a31217 --- /dev/null +++ b/src/ImageSharp/Common/Tuples/Octet.cs @@ -0,0 +1,100 @@ +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp.Tuples +{ + internal static class Octet + { + [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))] + public struct OfUInt32 + { + [FieldOffset(0 * sizeof(uint))] + public uint V0; + + [FieldOffset(1 * sizeof(uint))] + public uint V1; + + [FieldOffset(2 * sizeof(uint))] + public uint V2; + + [FieldOffset(3 * sizeof(uint))] + public uint V3; + + [FieldOffset(4 * sizeof(uint))] + public uint V4; + + [FieldOffset(5 * sizeof(uint))] + public uint V5; + + [FieldOffset(6 * sizeof(uint))] + public uint V6; + + [FieldOffset(7 
* sizeof(uint))] + public uint V7; + + public override string ToString() + { + return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; + } + + [MethodImpl(InliningOptions.ShortMethod)] + public void LoadFrom(ref OfByte src) + { + this.V0 = src.V0; + this.V1 = src.V1; + this.V2 = src.V2; + this.V3 = src.V3; + this.V4 = src.V4; + this.V5 = src.V5; + this.V6 = src.V6; + this.V7 = src.V7; + } + } + + [StructLayout(LayoutKind.Explicit, Size = 8)] + public struct OfByte + { + [FieldOffset(0)] + public byte V0; + + [FieldOffset(1)] + public byte V1; + + [FieldOffset(2)] + public byte V2; + + [FieldOffset(3)] + public byte V3; + + [FieldOffset(4)] + public byte V4; + + [FieldOffset(5)] + public byte V5; + + [FieldOffset(6)] + public byte V6; + + [FieldOffset(7)] + public byte V7; + + public override string ToString() + { + return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; + } + + [MethodImpl(InliningOptions.ShortMethod)] + public void LoadFrom(ref OfUInt32 src) + { + this.V0 = (byte)src.V0; + this.V1 = (byte)src.V1; + this.V2 = (byte)src.V2; + this.V3 = (byte)src.V3; + this.V4 = (byte)src.V4; + this.V5 = (byte)src.V5; + this.V6 = (byte)src.V6; + this.V7 = (byte)src.V7; + } + } + } +} \ No newline at end of file diff --git a/src/ImageSharp/Common/Tuples/Vector4Pair.cs b/src/ImageSharp/Common/Tuples/Vector4Pair.cs index 309d5e2e56..5988b2200b 100644 --- a/src/ImageSharp/Common/Tuples/Vector4Pair.cs +++ b/src/ImageSharp/Common/Tuples/Vector4Pair.cs @@ -2,7 +2,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -namespace SixLabors.ImageSharp.Common.Tuples +namespace SixLabors.ImageSharp.Tuples { /// /// Its faster to process multiple Vector4-s together, so let's pair them! 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs index 4b2626c582..5c63a478db 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs @@ -6,7 +6,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using SixLabors.ImageSharp.Common.Tuples; +using SixLabors.ImageSharp.Tuples; namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters { diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index ab4947e65c..3f26cdc907 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -6,7 +6,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using SixLabors.ImageSharp.Common.Tuples; +using SixLabors.ImageSharp.Tuples; // ReSharper disable ImpureMethodCallOnReadonlyValueField namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index 60abb7fb2c..293f3bc1f7 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -6,8 +6,8 @@ using System.Collections.Generic; 
using System.Linq; using System.Numerics; -using SixLabors.ImageSharp.Common.Tuples; using SixLabors.ImageSharp.Memory; +using SixLabors.ImageSharp.Tuples; using SixLabors.Memory; namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs index bfef60c606..564b93ef52 100644 --- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs @@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.PixelFormats } else { - ConvertToVector4UsingStandardIntrinsics(sourceColors, destinationVectors, count); + ConvertToVector4UsingBasicIntrinsics(sourceColors, destinationVectors, count); } } @@ -58,7 +58,7 @@ namespace SixLabors.ImageSharp.PixelFormats } else { - ConvertFromVector4StandardIntrinsics(sourceVectors, destinationColors, count); + ConvertFromVector4BasicIntrinsics(sourceVectors, destinationColors, count); } } @@ -112,7 +112,7 @@ namespace SixLabors.ImageSharp.PixelFormats } } - private static void ConvertToVector4UsingStandardIntrinsics( + private static void ConvertToVector4UsingBasicIntrinsics( ReadOnlySpan sourceColors, Span destinationVectors, int count) @@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.PixelFormats ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); - SimdUtils.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); + SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); } if (remainder > 0) @@ -155,7 +155,7 @@ namespace SixLabors.ImageSharp.PixelFormats } } - private static void ConvertFromVector4StandardIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count) + private static void ConvertFromVector4BasicIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count) { int remainder = count % 2; int alignedCount = count - 
remainder; @@ -165,7 +165,7 @@ namespace SixLabors.ImageSharp.PixelFormats ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount)); Span rawDest = MemoryMarshal.Cast(destinationColors); - SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); + SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); } if (remainder > 0) diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 726e214a96..855e9e4b97 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -30,8 +30,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Params( //64, //256, - //512, - 2048 + 512 + //1024 )] public int Count { get; set; } @@ -117,7 +117,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } - //[Benchmark] + [Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -159,7 +159,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - //[Benchmark] + [Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs index be19e719a8..ca85a350cc 100644 --- a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs +++ b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs @@ -5,6 +5,7 @@ using BenchmarkDotNet.Attributes; namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization { + [Config(typeof(Config.ShortClr))] public class UInt32ToSingle { private float[] data; @@ -66,8 +67,7 @@ namespace 
SixLabors.ImageSharp.Benchmarks.General.Vectorization Unsafe.Add(ref bf, i) = v; } } - - // This code is not correct at all, it's just here as reference + [Benchmark] public void StandardSimdFromInt() { @@ -86,5 +86,28 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization Unsafe.Add(ref bf, i) = v; } } + + + [Benchmark] + public void StandardSimdFromInt_RefCast() + { + int n = Count / Vector.Count; + + ref Vector bf = ref Unsafe.As>(ref this.data[0]); + ref Vector bu = ref Unsafe.As, Vector>(ref bf); + + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + ref Vector fRef = ref Unsafe.Add(ref bf, i); + + Vector du = Vector.AsVectorInt32(fRef); + Vector v = Vector.ConvertToSingle(du); + v *= scale; + + fRef = v; + } + } } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs index f71f6ec1bf..2bc3af4c98 100644 --- a/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs +++ b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs @@ -3,8 +3,11 @@ using System.Runtime.CompilerServices; using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Tuples; + namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization { + [Config(typeof(Config.ShortClr))] public class WidenBytesToUInt32 { private byte[] source; @@ -25,8 +28,8 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization { const int N = Count / 8; - ref SimdUtils.Octet.OfByte sBase = ref Unsafe.As(ref this.source[0]); - ref SimdUtils.Octet.OfUInt32 dBase = ref Unsafe.As(ref this.dest[0]); + ref Octet.OfByte sBase = ref Unsafe.As(ref this.source[0]); + ref Octet.OfUInt32 dBase = ref Unsafe.As(ref this.dest[0]); for (int i = 0; i < N; i++) { diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 4e1717bda9..2dcba2b74b 100644 --- 
a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -62,7 +62,7 @@ namespace SixLabors.ImageSharp.Tests.Common { float[] data = new float[Vector.Count]; - var rnd = new Random(); + var rnd = new Random(seed); for (int i = 0; i < Vector.Count; i++) { @@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common [InlineData(1, 8)] [InlineData(2, 16)] [InlineData(3, 128)] - public void BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count) + public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count) { if (this.SkipOnNonAvx2()) { @@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Tests.Common byte[] dest = new byte[count]; - SimdUtils.BulkConvertNormalizedFloatToByte(normalized, dest); + SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(normalized, dest); byte[] expected = orig.Select(f => (byte)(f)).ToArray(); @@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common [InlineData(1, 8)] [InlineData(2, 16)] [InlineData(3, 128)] - public void BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count) + public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count) { if (this.SkipOnNonAvx2()) { @@ -153,87 +153,113 @@ namespace SixLabors.ImageSharp.Tests.Common byte[] dest = new byte[count]; - SimdUtils.BulkConvertNormalizedFloatToByte(source, dest); + SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(source, dest); byte[] expected = source.Select(f => (byte)Math.Round(f * 255f)).ToArray(); Assert.Equal(expected, dest); } + public static readonly TheoryData ArraySizesDivisibleBy8 = new TheoryData { 0, 8, 16, 1024 }; + + public static readonly TheoryData ArraySizesDivisibleBy32 = new TheoryData { 0, 32, 512 }; + + public static readonly TheoryData ArbitraryArraySizes = + new TheoryData + { + 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 
520, 520, + }; [Theory] - [InlineData(1, 0)] - [InlineData(2, 32)] - [InlineData(3, 128)] - public void BulkConvertByteToNormalizedFloat(int seed, int count) + [MemberData(nameof(ArraySizesDivisibleBy8))] + public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count) { if (this.SkipOnNonAvx2()) { return; } - byte[] source = new Random(seed).GenerateRandomByteArray(count); - float[] result = new float[count]; - float[] expected = source.Select(b => (float)b / 255f).ToArray(); - - SimdUtils.BulkConvertByteToNormalizedFloat(source, result); - - Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); + TestImpl_BulkConvertByteToNormalizedFloat( + count, + (s, d) => SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(s.Span, d.Span)); } [Theory] - [InlineData(1, 0)] - [InlineData(2, 32)] - [InlineData(3, 128)] - public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count) + [MemberData(nameof(ArraySizesDivisibleBy32))] + public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int count) + { + TestImpl_BulkConvertByteToNormalizedFloat( + count, + (s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(s.Span, d.Span)); + } + + [Theory] + [MemberData(nameof(ArbitraryArraySizes))] + public void BulkConvertByteToNormalizedFloat(int count) + { + TestImpl_BulkConvertByteToNormalizedFloat( + count, + (s, d) => SimdUtils.BulkConvertByteToNormalizedFloat(s.Span, d.Span)); + } + + private static void TestImpl_BulkConvertByteToNormalizedFloat( + int count, + Action, Memory> convert) { - byte[] source = new Random(seed).GenerateRandomByteArray(count); + byte[] source = new Random(count).GenerateRandomByteArray(count); float[] result = new float[count]; float[] expected = source.Select(b => (float)b / 255f).ToArray(); - - SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result); + convert(source, result); Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } - - 
public static readonly TheoryData<int> BulkConvertNormalizedFloatToByteClampOverflows_Data = - new TheoryData<int> - { - 0, 64, 1024 - }; - [Theory] - [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))] - public void BulkConvertNormalizedFloatToByteClampOverflows(int count) + [MemberData(nameof(ArraySizesDivisibleBy8))] + public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) { if (this.SkipOnNonAvx2()) { return; } - float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f); - byte[] expected = source.Select(NormalizedFloatToByte).ToArray(); - byte[] actual = new byte[count]; - - SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(source, actual); - - Assert.Equal(expected, actual); + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, + (s, d) => SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span) + ); } [Theory] - [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))] + [MemberData(nameof(ArraySizesDivisibleBy32))] public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) + { + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, + (s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span) + ); + } + + [Theory] + [MemberData(nameof(ArbitraryArraySizes))] + public void BulkConvertNormalizedFloatToByteClampOverflows(int count) + { + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, + (s, d) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span) + ); + } + + private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( + int count, + Action<Memory<float>, Memory<byte>> convert) { float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f); byte[] expected = source.Select(NormalizedFloatToByte).ToArray(); byte[] actual = new byte[count]; - 
SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(source, actual); + convert(source, actual); Assert.Equal(expected, actual); } + private static byte NormalizedFloatToByte(float f) => (byte)Math.Min(255f, Math.Max(0f, f * 255f + 0.5f)); [Theory]