mirror of https://github.com/SixLabors/ImageSharp
committed by
GitHub
28 changed files with 2294 additions and 1136 deletions
@ -1,232 +0,0 @@ |
|||
// Copyright (c) Six Labors and contributors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Diagnostics; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
/// <summary>
|
|||
/// Various extension and utility methods for <see cref="Vector4"/> and <see cref="Vector{T}"/> utilizing SIMD capabilities
|
|||
/// </summary>
|
|||
internal static class SimdUtils |
|||
{ |
|||
/// <summary>
/// Gets a value indicating whether <see cref="Vector{T}"/> holds exactly 8 <see cref="float"/> lanes
/// and exactly 8 <see cref="int"/> lanes, i.e. whether both float and integer registers are 256 bit wide
/// (the configuration expected on an AVX2 CPU).
/// </summary>
public static bool IsAvx2CompatibleArchitecture
{
    get
    {
        return Vector<float>.Count == 8 && Vector<int>.Count == 8;
    }
}
|||
|
|||
/// <summary>
/// Throws when <see cref="IsAvx2CompatibleArchitecture"/> is false.
/// </summary>
/// <param name="operation">The name of the operation, used in the exception message</param>
/// <exception cref="NotSupportedException">The architecture is not AVX2 compatible</exception>
internal static void GuardAvx2(string operation)
{
    if (IsAvx2CompatibleArchitecture)
    {
        return;
    }

    throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!");
}
|||
|
|||
/// <summary>
/// Offsets every scalar in 'v' by half of its own value clamped to [-1, 1],
/// so that a subsequent conversion to <see cref="int"/> would have rounding semantics.
/// </summary>
/// <param name="v">The vector</param>
/// <returns>The offset vector</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector4 PseudoRound(this Vector4 v)
{
    // Equals -1 or +1 for |component| >= 1, and the component itself in between.
    Vector4 clampedSign = Vector4.Clamp(v, new Vector4(-1), new Vector4(1));
    Vector4 offset = clampedSign * 0.5f;

    return v + offset;
}
|||
|
|||
/// <summary>
/// Rounds all values in 'x' to the nearest integer following <see cref="MidpointRounding.ToEven"/> semantics.
/// Source:
/// <see>
///     <cref>https://github.com/g-truc/glm/blob/master/glm/simd/common.h#L110</cref>
/// </see>
/// </summary>
/// <param name="x">The vector</param>
/// <returns>The rounded vector</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector<float> FastRound(this Vector<float> x)
{
    // 0x80000000 in every lane: isolates the sign bit of each float.
    Vector<float> signMask = Vector.AsVectorSingle(new Vector<int>(int.MinValue));

    // 2^23 carrying the sign of the matching lane of 'x'.
    Vector<float> signedMagic = (signMask & x) | new Vector<float>(8388608.0f);

    // Adding and then subtracting the magic value discards the fractional bits.
    Vector<float> shifted = x + signedMagic;
    return shifted - signedMagic;
}
|||
|
|||
/// <summary>
/// Converts 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source'
/// into the 'dest' buffer of <see cref="byte"/> values, scaling them up into [0-255] with rounding.
/// Values are not clamped before conversion.
/// Based on:
/// <see>
///     <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
/// </see>
/// </summary>
/// <param name="source">The source span of floats; its length should be divisible by Vector&lt;float&gt;.Count</param>
/// <param name="dest">The destination span of bytes</param>
internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
{
    GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));

    DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");

    if (source.Length == 0)
    {
        return;
    }

    ref Vector<float> firstInput = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
    ref Octet.OfByte firstOutput = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
    int octetCount = source.Length / 8;

    var bias = new Vector<float>(32768.0f);
    Vector<float> factor = new Vector<float>(255f) / new Vector<float>(256f);

    // The vector is written through a reference into a temporary struct, because
    // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
    // does not work. TODO: This might be a CoreClr bug, need to ask/report
    var lanes = default(Octet.OfUInt32);
    ref Vector<float> lanesAsVector = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref lanes);

    for (int i = 0; i < octetCount; i++)
    {
        // Scalar equivalent:
        // union { float f; uint32_t i; } u;
        // u.f = 32768.0f + x * (255.0f / 256.0f);
        // return (uint8_t)u.i;
        lanesAsVector = (Unsafe.Add(ref firstInput, i) * factor) + bias;

        // Keeps the lowest byte of every 32 bit lane.
        ref Octet.OfByte target = ref Unsafe.Add(ref firstOutput, i);
        target.LoadFrom(ref lanes);
    }
}
|||
|
|||
/// <summary>
/// Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values into [0..1] before conversion.
/// </summary>
/// <param name="source">The source span of floats; its length should be divisible by Vector&lt;float&gt;.Count</param>
/// <param name="dest">The destination span of bytes</param>
/// <exception cref="NotSupportedException">The architecture is not AVX2 compatible</exception>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
{
    // Report this method's own name, so the exception message points to the right operation.
    GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));

    DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");

    if (source.Length == 0)
    {
        return;
    }

    ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
    ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
    int n = source.Length / 8;

    Vector<float> magick = new Vector<float>(32768.0f);
    Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);

    // need to copy to a temporary struct, because
    // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
    // does not work. TODO: This might be a CoreClr bug, need to ask/report
    var temp = default(Octet.OfUInt32);
    ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);

    for (int i = 0; i < n; i++)
    {
        // union { float f; uint32_t i; } u;
        // u.f = 32768.0f + x * (255.0f / 256.0f);
        // return (uint8_t)u.i;
        Vector<float> x = Unsafe.Add(ref srcBase, i);

        // Clamp into [0..1] first.
        x = Vector.Max(x, Vector<float>.Zero);
        x = Vector.Min(x, Vector<float>.One);

        x = (x * scale) + magick;
        tempRef = x;

        // Keeps the lowest byte of every 32 bit lane.
        ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
        d.LoadFrom(ref temp);
    }
}
|||
|
|||
// TODO: Replace these with T4-d library level tuples!

/// <summary>
/// Container for fixed-size tuples of 8 values.
/// </summary>
internal static class Octet
{
    /// <summary>
    /// 8 consecutive <see cref="uint"/> values (32 bytes in total).
    /// </summary>
    [StructLayout(LayoutKind.Explicit, Size = 32)]
    public struct OfUInt32
    {
        [FieldOffset(0)]
        public uint V0;

        [FieldOffset(4)]
        public uint V1;

        [FieldOffset(8)]
        public uint V2;

        [FieldOffset(12)]
        public uint V3;

        [FieldOffset(16)]
        public uint V4;

        [FieldOffset(20)]
        public uint V5;

        [FieldOffset(24)]
        public uint V6;

        [FieldOffset(28)]
        public uint V7;

        /// <inheritdoc />
        public override string ToString() =>
            $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
    }

    /// <summary>
    /// 8 consecutive <see cref="byte"/> values.
    /// </summary>
    [StructLayout(LayoutKind.Explicit, Size = 8)]
    public struct OfByte
    {
        [FieldOffset(0)]
        public byte V0;

        [FieldOffset(1)]
        public byte V1;

        [FieldOffset(2)]
        public byte V2;

        [FieldOffset(3)]
        public byte V3;

        [FieldOffset(4)]
        public byte V4;

        [FieldOffset(5)]
        public byte V5;

        [FieldOffset(6)]
        public byte V6;

        [FieldOffset(7)]
        public byte V7;

        /// <inheritdoc />
        public override string ToString() =>
            $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";

        /// <summary>
        /// Fills this octet by casting every element of 'i' to <see cref="byte"/>.
        /// </summary>
        /// <param name="i">The octet of 32 bit values to read from</param>
        public void LoadFrom(ref OfUInt32 i)
        {
            this.V0 = (byte)i.V0;
            this.V1 = (byte)i.V1;
            this.V2 = (byte)i.V2;
            this.V3 = (byte)i.V3;
            this.V4 = (byte)i.V4;
            this.V5 = (byte)i.V5;
            this.V6 = (byte)i.V6;
            this.V7 = (byte)i.V7;
        }
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,215 @@ |
|||
// Copyright (c) Six Labors and contributors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Diagnostics; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using SixLabors.ImageSharp.Tuples; |
|||
|
|||
// ReSharper disable MemberHidesStaticFromOuterClass
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
internal static partial class SimdUtils |
|||
{ |
|||
/// <summary>
|
|||
/// Implementation with 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen etc.)
|
|||
/// </summary>
|
|||
public static class BasicIntrinsics256 |
|||
{ |
|||
/// <summary>
/// Gets a value indicating whether this implementation can be used.
/// Initialized once from <see cref="IsAvx2CompatibleArchitecture"/>.
/// </summary>
public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture; |
|||
|
|||
/// <summary>
/// Runs <see cref="BulkConvertByteToNormalizedFloat"/> on as many leading elements as possible,
/// then slices both spans down so that only the unconverted remainder is kept.
/// Does nothing when <see cref="IsAvailable"/> is false.
/// </summary>
/// <param name="source">The source span of bytes; reduced to the remainder on return</param>
/// <param name="dest">The destination span of floats; reduced to the remainder on return</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertByteToNormalizedFloatReduce(
    ref ReadOnlySpan<byte> source,
    ref Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (IsAvailable)
    {
        // Whatever ImageMaths.Modulo8 reports for the length is left to the caller.
        int bulkLength = source.Length - ImageMaths.Modulo8(source.Length);

        if (bulkLength > 0)
        {
            BulkConvertByteToNormalizedFloat(source.Slice(0, bulkLength), dest.Slice(0, bulkLength));

            source = source.Slice(bulkLength);
            dest = dest.Slice(bulkLength);
        }
    }
}
|||
|
|||
/// <summary>
/// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on as many leading elements as possible,
/// then slices both spans down so that only the unconverted remainder is kept.
/// Does nothing when <see cref="IsAvailable"/> is false.
/// </summary>
/// <param name="source">The source span of floats; reduced to the remainder on return</param>
/// <param name="dest">The destination span of bytes; reduced to the remainder on return</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (IsAvailable)
    {
        // Whatever ImageMaths.Modulo8 reports for the length is left to the caller.
        int bulkLength = source.Length - ImageMaths.Modulo8(source.Length);

        if (bulkLength > 0)
        {
            BulkConvertNormalizedFloatToByteClampOverflows(
                source.Slice(0, bulkLength),
                dest.Slice(0, bulkLength));

            source = source.Slice(bulkLength);
            dest = dest.Slice(bulkLength);
        }
    }
}
|||
|
|||
/// <summary>
/// SIMD optimized implementation for <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>.
/// Works only with span Length divisible by 8.
/// Implementation adapted from:
/// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
/// http://stackoverflow.com/a/536278
/// </summary>
/// <param name="source">The source span of bytes</param>
/// <param name="dest">The destination span of floats</param>
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
    VerifyIsAvx2Compatible(nameof(BulkConvertByteToNormalizedFloat));
    VerifySpanInput(source, dest, 8);

    int octetCount = dest.Length / 8;

    ref Octet.OfByte firstPacked = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(source));
    ref Octet.OfUInt32 firstWide = ref Unsafe.As<float, Octet.OfUInt32>(ref MemoryMarshal.GetReference(dest));
    ref Vector<float> firstVector = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref firstWide);

    // Pass 1: load every octet of source bytes into the matching 32 bit octet of the destination memory.
    for (int i = 0; i < octetCount; i++)
    {
        ref Octet.OfByte packed = ref Unsafe.Add(ref firstPacked, i);
        ref Octet.OfUInt32 wide = ref Unsafe.Add(ref firstWide, i);
        wide.LoadFrom(ref packed);
    }

    var lowByteMask = new Vector<uint>(255);
    var biasBits = new Vector<uint>(1191182336); // reinterpreted value of 32768.0f
    var bias = new Vector<float>(32768.0f);
    var rescale = new Vector<float>(256.0f / 255.0f);

    // Pass 2: reinterpret the destination memory as float vectors and turn the integers into normalized floats in place.
    for (int i = 0; i < octetCount; i++)
    {
        ref Vector<float> slot = ref Unsafe.Add(ref firstVector, i);

        // Placing the byte into the low mantissa bits of 32768.0f yields 32768 + byte / 256.
        Vector<uint> bits = (Vector.AsVectorUInt32(slot) & lowByteMask) | biasBits;

        // (byte / 256) * (256 / 255) == byte / 255
        slot = (Vector.AsVectorSingle(bits) - bias) * rescale;
    }
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> which is faster on older runtimes.
/// Values are clamped into [0..1] before being scaled and converted.
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="dest">The destination span of bytes</param>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
{
    VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
    VerifySpanInput(source, dest, 8);

    if (source.Length == 0)
    {
        return;
    }

    ref Vector<float> firstInput = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
    ref Octet.OfByte firstOutput = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
    int octetCount = source.Length / 8;

    var bias = new Vector<float>(32768.0f);
    Vector<float> factor = new Vector<float>(255f) / new Vector<float>(256f);

    // The vector is written through a reference into a temporary struct, because
    // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
    // does not work. TODO: This might be a CoreClr bug, need to ask/report
    var lanes = default(Octet.OfUInt32);
    ref Vector<float> lanesAsVector = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref lanes);

    for (int i = 0; i < octetCount; i++)
    {
        // Scalar equivalent (after clamping):
        // union { float f; uint32_t i; } u;
        // u.f = 32768.0f + x * (255.0f / 256.0f);
        // return (uint8_t)u.i;
        Vector<float> clamped = Vector.Max(Unsafe.Add(ref firstInput, i), Vector<float>.Zero);
        clamped = Vector.Min(clamped, Vector<float>.One);

        lanesAsVector = (clamped * factor) + bias;

        ref Octet.OfByte target = ref Unsafe.Add(ref firstOutput, i);
        target.LoadFrom(ref lanes);
    }
}
|||
|
|||
/// <summary>
/// Converts all <see cref="float"/> values normalized into [0..1] from 'source'
/// into the 'dest' buffer of <see cref="byte"/>. The values are scaled up into [0-255] and rounded;
/// they are not clamped before conversion.
/// This implementation is SIMD optimized and works only when span Length is divisible by 8.
/// Based on:
/// <see>
///     <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
/// </see>
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="dest">The destination span of bytes</param>
internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
{
    VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByte));
    VerifySpanInput(source, dest, 8);

    if (source.Length == 0)
    {
        return;
    }

    ref Vector<float> firstInput = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
    ref Octet.OfByte firstOutput = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
    int octetCount = source.Length / 8;

    var bias = new Vector<float>(32768.0f);
    Vector<float> factor = new Vector<float>(255f) / new Vector<float>(256f);

    // The vector is written through a reference into a temporary struct, because
    // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
    // does not work. TODO: This might be a CoreClr bug, need to ask/report
    var lanes = default(Octet.OfUInt32);
    ref Vector<float> lanesAsVector = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref lanes);

    for (int i = 0; i < octetCount; i++)
    {
        // Scalar equivalent:
        // union { float f; uint32_t i; } u;
        // u.f = 32768.0f + x * (255.0f / 256.0f);
        // return (uint8_t)u.i;
        lanesAsVector = (Unsafe.Add(ref firstInput, i) * factor) + bias;

        ref Octet.OfByte target = ref Unsafe.Add(ref firstOutput, i);
        target.LoadFrom(ref lanes);
    }
}
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,178 @@ |
|||
using System; |
|||
using System.Diagnostics; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
// ReSharper disable MemberHidesStaticFromOuterClass
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
internal static partial class SimdUtils |
|||
{ |
|||
/// <summary>
|
|||
/// Implementation methods based on newer <see cref="Vector{T}"/> API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*).
|
|||
/// Accelerated only on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
|
|||
/// See:
|
|||
/// https://github.com/dotnet/coreclr/pull/10662
|
|||
/// API Proposal:
|
|||
/// https://github.com/dotnet/corefx/issues/15957
|
|||
/// </summary>
|
|||
public static class ExtendedIntrinsics |
|||
{ |
|||
#if NETCOREAPP2_1
/// <summary>
/// Gets a value indicating whether this implementation can be used.
/// Initialized once from <see cref="Vector.IsHardwareAccelerated"/>.
/// </summary>
// TODO: Also available in .NET 4.7.2, we need to add a build target!
public static bool IsAvailable { get; } = Vector.IsHardwareAccelerated;
#else
/// <summary>
/// Gets a value indicating whether this implementation can be used.
/// Always false on this build target.
/// </summary>
public static bool IsAvailable { get; } = false;
#endif
|
|||
|
|||
/// <summary>
/// Runs <see cref="BulkConvertByteToNormalizedFloat"/> on as many leading elements as possible,
/// then slices both spans down so that only the unconverted remainder is kept.
/// Does nothing when <see cref="IsAvailable"/> is false.
/// </summary>
/// <param name="source">The source span of bytes; reduced to the remainder on return</param>
/// <param name="dest">The destination span of floats; reduced to the remainder on return</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertByteToNormalizedFloatReduce(
    ref ReadOnlySpan<byte> source,
    ref Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (IsAvailable)
    {
        // Whatever ImageMaths.ModuloP2 reports for the length against Vector<byte>.Count is left to the caller.
        int bulkLength = source.Length - ImageMaths.ModuloP2(source.Length, Vector<byte>.Count);

        if (bulkLength > 0)
        {
            BulkConvertByteToNormalizedFloat(source.Slice(0, bulkLength), dest.Slice(0, bulkLength));

            source = source.Slice(bulkLength);
            dest = dest.Slice(bulkLength);
        }
    }
}
|||
|
|||
/// <summary>
/// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on as many leading elements as possible,
/// then slices both spans down so that only the unconverted remainder is kept.
/// Does nothing when <see cref="IsAvailable"/> is false.
/// </summary>
/// <param name="source">The source span of floats; reduced to the remainder on return</param>
/// <param name="dest">The destination span of bytes; reduced to the remainder on return</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    if (IsAvailable)
    {
        // Whatever ImageMaths.ModuloP2 reports for the length against Vector<byte>.Count is left to the caller.
        int bulkLength = source.Length - ImageMaths.ModuloP2(source.Length, Vector<byte>.Count);

        if (bulkLength > 0)
        {
            BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, bulkLength), dest.Slice(0, bulkLength));

            source = source.Slice(bulkLength);
            dest = dest.Slice(bulkLength);
        }
    }
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// Processes the input in blocks of Vector&lt;byte&gt;.Count elements.
/// </summary>
/// <param name="source">The source span of bytes</param>
/// <param name="dest">The destination span of floats</param>
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
    VerifySpanInput(source, dest, Vector<byte>.Count);

    ref Vector<byte> firstPacked = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(source));
    ref Vector<float> firstOutput = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dest));

    int blockCount = dest.Length / Vector<byte>.Count;

    for (int i = 0; i < blockCount; i++)
    {
        // byte -> ushort -> uint: one byte vector widens into four uint vectors.
        Vector.Widen(Unsafe.Add(ref firstPacked, i), out Vector<ushort> lowShorts, out Vector<ushort> highShorts);
        Vector.Widen(lowShorts, out Vector<uint> quarter0, out Vector<uint> quarter1);
        Vector.Widen(highShorts, out Vector<uint> quarter2, out Vector<uint> quarter3);

        // Each byte vector produces four consecutive float vectors in the destination.
        ref Vector<float> block = ref Unsafe.Add(ref firstOutput, i * 4);
        block = ConvertToSingle(quarter0);
        Unsafe.Add(ref block, 1) = ConvertToSingle(quarter1);
        Unsafe.Add(ref block, 2) = ConvertToSingle(quarter2);
        Unsafe.Add(ref block, 3) = ConvertToSingle(quarter3);
    }
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
/// Processes the input in blocks of Vector&lt;byte&gt;.Count elements.
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="dest">The destination span of bytes</param>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(
    ReadOnlySpan<float> source,
    Span<byte> dest)
{
    VerifySpanInput(source, dest, Vector<byte>.Count);

    ref Vector<float> firstInput = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
    ref Vector<byte> firstOutput = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(dest));

    int blockCount = dest.Length / Vector<byte>.Count;

    for (int i = 0; i < blockCount; i++)
    {
        // Four consecutive float vectors are packed into a single byte vector.
        ref Vector<float> block = ref Unsafe.Add(ref firstInput, i * 4);

        Vector<uint> quarter0 = ConvertToUInt32(block);
        Vector<uint> quarter1 = ConvertToUInt32(Unsafe.Add(ref block, 1));
        Vector<uint> quarter2 = ConvertToUInt32(Unsafe.Add(ref block, 2));
        Vector<uint> quarter3 = ConvertToUInt32(Unsafe.Add(ref block, 3));

        // uint -> ushort -> byte
        Vector<ushort> lowShorts = Vector.Narrow(quarter0, quarter1);
        Vector<ushort> highShorts = Vector.Narrow(quarter2, quarter3);

        Unsafe.Add(ref firstOutput, i) = Vector.Narrow(lowShorts, highShorts);
    }
}
|||
|
|||
/// <summary>
/// Scales every lane by 255, adds 0.5, clamps the result into [0, 255]
/// and converts it to an integer lane.
/// </summary>
/// <param name="vf">The vector of floats</param>
/// <returns>The converted lanes, reinterpreted as <see cref="uint"/></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<uint> ConvertToUInt32(Vector<float> vf)
{
    var upperBound = new Vector<float>(255f);

    Vector<float> scaled = (vf * upperBound) + new Vector<float>(0.5f);
    Vector<float> clamped = Vector.Min(Vector.Max(scaled, Vector<float>.Zero), upperBound);

    return Vector.AsVectorUInt32(Vector.ConvertToInt32(clamped));
}
|||
|
|||
/// <summary>
/// Reinterprets the lanes as <see cref="int"/>, converts them to <see cref="float"/>
/// and multiplies every lane by 1/255.
/// </summary>
/// <param name="u">The vector of integers</param>
/// <returns>The scaled vector of floats</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<float> ConvertToSingle(Vector<uint> u)
{
    Vector<float> asFloat = Vector.ConvertToSingle(Vector.AsVectorInt32(u));
    return asFloat * new Vector<float>(1f / 255f);
}
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,151 @@ |
|||
// Copyright (c) Six Labors and contributors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
// ReSharper disable MemberHidesStaticFromOuterClass
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
internal static partial class SimdUtils |
|||
{ |
|||
/// <summary>
|
|||
/// Fallback implementation based on <see cref="Vector4"/> (128bit).
|
|||
/// For <see cref="Vector4"/>, efficient software fallback implementations are present,
|
|||
/// and we hope that even mono's JIT is able to emit SIMD instructions for that type :P
|
|||
/// </summary>
|
|||
public static class FallbackIntrinsics128 |
|||
{ |
|||
/// <summary>
/// Runs <see cref="BulkConvertByteToNormalizedFloat"/> on as many leading elements as possible,
/// then slices both spans down so that only the unconverted remainder is kept.
/// </summary>
/// <param name="source">The source span of bytes; reduced to the remainder on return</param>
/// <param name="dest">The destination span of floats; reduced to the remainder on return</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertByteToNormalizedFloatReduce(
    ref ReadOnlySpan<byte> source,
    ref Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    // Whatever ImageMaths.Modulo4 reports for the length is left to the caller.
    int bulkLength = source.Length - ImageMaths.Modulo4(source.Length);

    if (bulkLength <= 0)
    {
        return;
    }

    BulkConvertByteToNormalizedFloat(source.Slice(0, bulkLength), dest.Slice(0, bulkLength));

    source = source.Slice(bulkLength);
    dest = dest.Slice(bulkLength);
}
|||
|
|||
/// <summary>
/// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on as many leading elements as possible,
/// then slices both spans down so that only the unconverted remainder is kept.
/// </summary>
/// <param name="source">The source span of floats; reduced to the remainder on return</param>
/// <param name="dest">The destination span of bytes; reduced to the remainder on return</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    // Whatever ImageMaths.Modulo4 reports for the length is left to the caller.
    int bulkLength = source.Length - ImageMaths.Modulo4(source.Length);

    if (bulkLength <= 0)
    {
        return;
    }

    BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, bulkLength), dest.Slice(0, bulkLength));

    source = source.Slice(bulkLength);
    dest = dest.Slice(bulkLength);
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/> using <see cref="Vector4"/>.
/// Converts 4 elements per iteration, multiplying every byte by 1/255.
/// </summary>
/// <param name="source">The source span of bytes</param>
/// <param name="dest">The destination span of floats</param>
[MethodImpl(InliningOptions.ColdPath)]
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
    VerifySpanInput(source, dest, 4);

    int quadCount = dest.Length / 4;
    if (quadCount == 0)
    {
        return;
    }

    const float Scale = 1f / 255f;

    ref ByteVector4 firstPacked = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(source));
    ref Vector4 firstOutput = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(dest));

    for (int i = 0; i < quadCount; i++)
    {
        ref ByteVector4 packed = ref Unsafe.Add(ref firstPacked, i);

        // Widen the four bytes into a Vector4, then scale all lanes at once.
        var widened = new Vector4(packed.X, packed.Y, packed.Z, packed.W);
        Unsafe.Add(ref firstOutput, i) = widened * Scale;
    }
}
|||
|
|||
/// <summary>
/// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> using <see cref="Vector4"/>.
/// Converts 4 elements per iteration: scales by 255, adds 0.5, clamps into [0, 255] and casts to <see cref="byte"/>.
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="dest">The destination span of bytes</param>
[MethodImpl(InliningOptions.ColdPath)]
internal static void BulkConvertNormalizedFloatToByteClampOverflows(
    ReadOnlySpan<float> source,
    Span<byte> dest)
{
    VerifySpanInput(source, dest, 4);

    int quadCount = source.Length / 4;
    if (quadCount == 0)
    {
        return;
    }

    var upperBound = new Vector4(255f);
    var roundingBias = new Vector4(0.5f);

    ref Vector4 firstInput = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(source));
    ref ByteVector4 firstOutput = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(dest));

    for (int i = 0; i < quadCount; i++)
    {
        Vector4 scaled = (Unsafe.Add(ref firstInput, i) * upperBound) + roundingBias;

        // I'm not sure if Vector4.Clamp() is properly implemented with intrinsics.
        Vector4 clamped = Vector4.Min(upperBound, Vector4.Max(Vector4.Zero, scaled));

        ref ByteVector4 target = ref Unsafe.Add(ref firstOutput, i);
        target.X = (byte)clamped.X;
        target.Y = (byte)clamped.Y;
        target.Z = (byte)clamped.Z;
        target.W = (byte)clamped.W;
    }
}
|||
|
|||
/// <summary>
/// Four consecutive <see cref="byte"/> values, used to reinterpret byte buffers in groups of 4.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
private struct ByteVector4
{
    /// <summary>The first byte.</summary>
    public byte X;

    /// <summary>The second byte.</summary>
    public byte Y;

    /// <summary>The third byte.</summary>
    public byte Z;

    /// <summary>The fourth byte.</summary>
    public byte W;
}
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,185 @@ |
|||
// Copyright (c) Six Labors and contributors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Diagnostics; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
using SixLabors.ImageSharp.PixelFormats; |
|||
using SixLabors.ImageSharp.Tuples; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
/// <summary>
|
|||
/// Various extension and utility methods for <see cref="Vector4"/> and <see cref="Vector{T}"/> utilizing SIMD capabilities
|
|||
/// </summary>
|
|||
internal static partial class SimdUtils |
|||
{ |
|||
/// <summary>
/// Gets a value indicating whether <see cref="Vector{T}"/> is hardware accelerated and holds exactly
/// 8 <see cref="float"/> lanes and 8 <see cref="int"/> lanes, i.e. whether both float and integer
/// registers are 256 bit wide (the configuration expected on an AVX2 CPU).
/// The value is computed once, when the type is initialized.
/// </summary>
public static bool IsAvx2CompatibleArchitecture { get; } =
    Vector<float>.Count == 8 && Vector<int>.Count == 8 && Vector.IsHardwareAccelerated;
|||
|
|||
/// <summary>
/// Offsets every scalar in 'v' by half of its own value clamped to [-1, 1],
/// so that a subsequent conversion to <see cref="int"/> would have rounding semantics.
/// </summary>
/// <param name="v">The vector</param>
/// <returns>The offset vector</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector4 PseudoRound(this Vector4 v)
{
    // Equals -1 or +1 for |component| >= 1, and the component itself in between.
    Vector4 clampedSign = Vector4.Clamp(v, new Vector4(-1), new Vector4(1));
    Vector4 offset = clampedSign * 0.5f;

    return v + offset;
}
|||
|
|||
/// <summary>
/// Rounds all values in 'v' to the nearest integer following <see cref="MidpointRounding.ToEven"/> semantics.
/// Source:
/// <see>
///     <cref>https://github.com/g-truc/glm/blob/master/glm/simd/common.h#L110</cref>
/// </see>
/// </summary>
/// <param name="v">The vector</param>
/// <returns>The rounded vector</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector<float> FastRound(this Vector<float> v)
{
    // 0x80000000 in every lane: isolates the sign bit of each float.
    Vector<float> signMask = Vector.AsVectorSingle(new Vector<int>(int.MinValue));

    // 2^23 carrying the sign of the matching lane of 'v'.
    Vector<float> signedMagic = (signMask & v) | new Vector<float>(8388608.0f);

    // Adding and then subtracting the magic value discards the fractional bits.
    Vector<float> shifted = v + signedMagic;
    return shifted - signedMagic;
}
|||
|
|||
/// <summary>
/// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
/// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
/// but there are no restrictions on the span's length.
/// </summary>
/// <param name="source">The source span of bytes</param>
/// <param name="dest">The destination span of floats</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    // Each "Reduce" step receives the spans by reference and hands back what it did not convert.
#if NETCOREAPP2_1
    ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
#else
    BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
#endif
    FallbackIntrinsics128.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);

    if (source.Length == 0)
    {
        return;
    }

    // Deal with the remainder:
    ConverByteToNormalizedFloatRemainder(source, dest);
}
|||
|
|||
/// <summary>
/// Converts all <see cref="float"/> values normalized into [0..1] from 'source' into the 'dest' buffer of <see cref="byte"/>.
/// The values are scaled up into [0-255] and rounded, overflows are clamped.
/// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
/// but there are no restrictions on the span's length.
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="dest">The destination span of bytes</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

    // Each "Reduce" step receives the spans by reference and hands back what it did not convert.
#if NETCOREAPP2_1
    ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
#else
    BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
#endif
    FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);

    if (source.Length == 0)
    {
        return;
    }

    // Deal with the remainder:
    ConvertNormalizedFloatToByteRemainder(source, dest);
}
|||
|
|||
[MethodImpl(InliningOptions.ColdPath)]
private static void ConverByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> dest)
{
    // Only lengths 1, 2 and 3 are handled; any other length leaves 'dest' untouched.
    // There are at most 3 elements at this point, so a loop would be overkill.
    int leftover = source.Length;

    if (leftover < 1 || leftover > 3)
    {
        return;
    }

    ref byte src = ref MemoryMarshal.GetReference(source);
    ref float dst = ref MemoryMarshal.GetReference(dest);

    // Elements are written from the highest index down to index 0.
    if (leftover == 3)
    {
        Unsafe.Add(ref dst, 2) = Unsafe.Add(ref src, 2) / 255f;
    }

    if (leftover >= 2)
    {
        Unsafe.Add(ref dst, 1) = Unsafe.Add(ref src, 1) / 255f;
    }

    dst = src / 255f;
}
|||
|
|||
[MethodImpl(InliningOptions.ColdPath)]
private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan<float> source, Span<byte> dest)
{
    // Only lengths 1, 2 and 3 are handled; any other length leaves 'dest' untouched.
    int leftover = source.Length;

    if (leftover < 1 || leftover > 3)
    {
        return;
    }

    ref float src = ref MemoryMarshal.GetReference(source);
    ref byte dst = ref MemoryMarshal.GetReference(dest);

    // Elements are written from the highest index down to index 0.
    if (leftover == 3)
    {
        Unsafe.Add(ref dst, 2) = ConvertToByte(Unsafe.Add(ref src, 2));
    }

    if (leftover >= 2)
    {
        Unsafe.Add(ref dst, 1) = ConvertToByte(Unsafe.Add(ref src, 1));
    }

    dst = ConvertToByte(src);
}
|||
|
|||
[MethodImpl(InliningOptions.ShortMethod)]
private static byte ConvertToByte(float f)
{
    // Scale to the byte range and add 0.5 so that the truncating cast below rounds.
    float scaled = (f * 255f) + 0.5f;

    return (byte)ComparableExtensions.Clamp(scaled, 0, 255f);
}
|||
|
|||
// Debug-only check: throws when IsAvx2CompatibleArchitecture is false.
// 'operation' is only used to build the exception message.
[Conditional("DEBUG")]
private static void VerifyIsAvx2Compatible(string operation)
{
    if (IsAvx2CompatibleArchitecture)
    {
        return;
    }

    throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!");
}
|||
|
|||
// Debug-only validation for byte -> float conversions: both spans must have the same
// length, and that length must leave no remainder for 'shouldBeDivisibleBy'
// (computed via ImageMaths.ModuloP2).
[Conditional("DEBUG")]
private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<float> dest, int shouldBeDivisibleBy)
{
    bool sameLength = source.Length == dest.Length;
    DebugGuard.IsTrue(sameLength, nameof(source), "Input spans must be of same length!");

    int remainder = ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy);
    DebugGuard.IsTrue(remainder == 0, nameof(source), $"length should be divisable by {shouldBeDivisibleBy}!");
}
|||
|
|||
// Debug-only validation for float -> byte conversions: both spans must have the same
// length, and that length must leave no remainder for 'shouldBeDivisibleBy'
// (computed via ImageMaths.ModuloP2).
[Conditional("DEBUG")]
private static void VerifySpanInput(ReadOnlySpan<float> source, Span<byte> dest, int shouldBeDivisibleBy)
{
    bool sameLength = source.Length == dest.Length;
    DebugGuard.IsTrue(sameLength, nameof(source), "Input spans must be of same length!");

    int remainder = ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy);
    DebugGuard.IsTrue(remainder == 0, nameof(source), $"length should be divisable by {shouldBeDivisibleBy}!");
}
|||
} |
|||
} |
|||
@ -0,0 +1,109 @@ |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp.Tuples |
|||
{ |
|||
/// <summary>
/// Groups value tuples that hold exactly 8 elements of a given primitive type.
/// </summary>
internal static class Octet
{
    /// <summary>
    /// A tuple of 8 <see cref="uint"/> values laid out back to back (explicit layout, 8 * sizeof(uint) bytes).
    /// </summary>
    [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
    public struct OfUInt32
    {
        [FieldOffset(0 * sizeof(uint))]
        public uint V0;

        [FieldOffset(1 * sizeof(uint))]
        public uint V1;

        [FieldOffset(2 * sizeof(uint))]
        public uint V2;

        [FieldOffset(3 * sizeof(uint))]
        public uint V3;

        [FieldOffset(4 * sizeof(uint))]
        public uint V4;

        [FieldOffset(5 * sizeof(uint))]
        public uint V5;

        [FieldOffset(6 * sizeof(uint))]
        public uint V6;

        [FieldOffset(7 * sizeof(uint))]
        public uint V7;

        /// <summary>
        /// Formats the tuple as "Octet.OfUInt32(v0,...,v7)".
        /// </summary>
        /// <returns>The string representation listing all 8 values.</returns>
        public override string ToString() =>
            $"{nameof(Octet)}.{nameof(OfUInt32)}({this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7})";

        /// <summary>
        /// Fills this tuple from a byte tuple; each byte is assigned to the <see cref="uint"/> field of the same index.
        /// </summary>
        /// <param name="src">The byte tuple to read from.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        public void LoadFrom(ref OfByte src)
        {
            this.V0 = src.V0;
            this.V1 = src.V1;
            this.V2 = src.V2;
            this.V3 = src.V3;
            this.V4 = src.V4;
            this.V5 = src.V5;
            this.V6 = src.V6;
            this.V7 = src.V7;
        }
    }

    /// <summary>
    /// A tuple of 8 <see cref="byte"/> values laid out back to back (explicit layout, 8 bytes).
    /// </summary>
    [StructLayout(LayoutKind.Explicit, Size = 8)]
    public struct OfByte
    {
        [FieldOffset(0)]
        public byte V0;

        [FieldOffset(1)]
        public byte V1;

        [FieldOffset(2)]
        public byte V2;

        [FieldOffset(3)]
        public byte V3;

        [FieldOffset(4)]
        public byte V4;

        [FieldOffset(5)]
        public byte V5;

        [FieldOffset(6)]
        public byte V6;

        [FieldOffset(7)]
        public byte V7;

        /// <summary>
        /// Formats the tuple as "Octet.OfByte(v0,...,v7)".
        /// </summary>
        /// <returns>The string representation listing all 8 values.</returns>
        public override string ToString() =>
            $"{nameof(Octet)}.{nameof(OfByte)}({this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7})";

        /// <summary>
        /// Fills this tuple from a <see cref="uint"/> tuple; each value is narrowed with a plain (byte) cast,
        /// so no clamping is applied here.
        /// </summary>
        /// <param name="src">The <see cref="uint"/> tuple to read from.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        public void LoadFrom(ref OfUInt32 src)
        {
            this.V0 = (byte)src.V0;
            this.V1 = (byte)src.V1;
            this.V2 = (byte)src.V2;
            this.V3 = (byte)src.V3;
            this.V4 = (byte)src.V4;
            this.V5 = (byte)src.V5;
            this.V6 = (byte)src.V6;
            this.V7 = (byte)src.V7;
        }
    }
}
|||
} |
|||
@ -1,9 +1,9 @@ |
|||
namespace SixLabors.ImageSharp.Benchmarks.General |
|||
{ |
|||
using System; |
|||
using System; |
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath |
|||
{ |
|||
public class Abs |
|||
{ |
|||
[Params(-1, 1)] |
|||
@ -0,0 +1,70 @@ |
|||
using System; |
|||
using System.Runtime.CompilerServices; |
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using BenchmarkDotNet.Running; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath |
|||
{ |
|||
/// <summary>
/// Benchmarks two ways of clamping a <see cref="float"/> into [min, max]:
/// nested Math.Min / Math.Max calls versus explicit comparisons.
/// </summary>
public class ClampFloat
{
    // Sample inputs covering values below, inside, exactly on and above the [min, max] bounds.
    private static readonly float[] Values = { -10, -5, -3, -1.5f, -0.5f, 0f, 1f, 1.5f, 2.5f, 3, 10 };

    private readonly float min = -1.5f;

    private readonly float max = 2.5f;

    /// <summary>
    /// Baseline: clamps every sample through <see cref="ClampUsingMathF"/> and sums the results.
    /// </summary>
    /// <returns>The sum of the clamped samples (returned so the work is observable).</returns>
    [Benchmark(Baseline = true)]
    public float UsingMathF()
    {
        float sum = 0;

        for (int index = 0; index < Values.Length; index++)
        {
            sum += ClampUsingMathF(Values[index], this.min, this.max);
        }

        return sum;
    }

    /// <summary>
    /// Clamps every sample through <see cref="ClampUsingBranching"/> and sums the results.
    /// </summary>
    /// <returns>The sum of the clamped samples (returned so the work is observable).</returns>
    [Benchmark]
    public float UsingBranching()
    {
        float sum = 0;

        for (int index = 0; index < Values.Length; index++)
        {
            sum += ClampUsingBranching(Values[index], this.min, this.max);
        }

        return sum;
    }

    // Clamp expressed as min(max, max(min, x)). Despite the name, this calls System.Math.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static float ClampUsingMathF(float x, float min, float max) => Math.Min(max, Math.Max(min, x));

    // Clamp expressed with two comparisons; the upper bound is tested first.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static float ClampUsingBranching(float x, float min, float max)
    {
        if (x >= max)
        {
            return max;
        }

        if (x <= min)
        {
            return min;
        }

        return x;
    }

    // RESULTS:
    //          Method |     Mean |     Error |    StdDev | Scaled |
    // --------------- |---------:|----------:|----------:|-------:|
    //      UsingMathF | 30.37 ns | 0.3764 ns | 0.3337 ns |   1.00 |
    //  UsingBranching | 18.66 ns | 0.1043 ns | 0.0871 ns |   0.61 |
}
|||
} |
|||
@ -0,0 +1,23 @@ |
|||
using BenchmarkDotNet.Attributes; |
|||
using BenchmarkDotNet.Attributes.Jobs; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath |
|||
{ |
|||
/// <summary>
/// Benchmarks the <c>%</c> operator with the constant divisor 8 against ImageMaths.Modulo8.
/// </summary>
[LongRunJob]
public class ModuloPowerOfTwoConstant
{
    // The operand is read from an instance field in both benchmarks.
    private readonly int value = 42;

    /// <summary>
    /// Baseline: remainder computed with the <c>%</c> operator.
    /// </summary>
    /// <returns>The field value modulo 8.</returns>
    [Benchmark(Baseline = true)]
    public int Standard() => this.value % 8;

    /// <summary>
    /// Remainder computed by the ImageMaths.Modulo8 helper.
    /// </summary>
    /// <returns>The helper's result for the field value.</returns>
    [Benchmark]
    public int Bitwise() => ImageMaths.Modulo8(this.value);
}
|||
} |
|||
@ -0,0 +1,32 @@ |
|||
using BenchmarkDotNet.Attributes; |
|||
using BenchmarkDotNet.Attributes.Jobs; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath |
|||
{ |
|||
/// <summary>
/// Benchmarks the <c>%</c> operator with a divisor held in a field against ImageMaths.ModuloP2.
/// </summary>
[LongRunJob]
public class ModuloPowerOfTwoVariable
{
    // Dividend used by both benchmarks.
    private readonly int value = 42;

    // Divisor used by both benchmarks (32, a power of two).
    private readonly int m = 32;

    /// <summary>
    /// Baseline: remainder computed with the <c>%</c> operator.
    /// </summary>
    /// <returns>The dividend modulo the divisor field.</returns>
    [Benchmark(Baseline = true)]
    public int Standard() => this.value % this.m;

    /// <summary>
    /// Remainder computed by the ImageMaths.ModuloP2 helper.
    /// </summary>
    /// <returns>The helper's result for the two fields.</returns>
    [Benchmark]
    public int Bitwise() => ImageMaths.ModuloP2(this.value, this.m);

    // RESULTS:
    //
    //    Method |      Mean |     Error |    StdDev |    Median | Scaled | ScaledSD |
    // --------- |----------:|----------:|----------:|----------:|-------:|---------:|
    //  Standard | 1.2465 ns | 0.0093 ns | 0.0455 ns | 1.2423 ns |   1.00 |     0.00 |
    //   Bitwise | 0.0265 ns | 0.0103 ns | 0.0515 ns | 0.0000 ns |   0.02 |     0.04 |
}
|||
} |
|||
@ -1,7 +1,8 @@ |
|||
using System; |
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.General |
|||
namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath |
|||
{ |
|||
public class Pow |
|||
{ |
|||
@ -1,19 +0,0 @@ |
|||
namespace SixLabors.ImageSharp.Benchmarks.General |
|||
{ |
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
/// <summary>
/// Compares the <c>%</c> operator with a bitwise mask for computing a value modulo 256.
/// </summary>
public class Modulus
{
    // The operand lives in an instance field on purpose. Written as literals
    // ("255 % 256" and "255 & 255"), both benchmark bodies are constant expressions
    // that the C# compiler evaluates at compile time, so the methods would merely
    // return a constant and neither operator would be measured.
    private readonly int value = 255;

    /// <summary>
    /// Baseline: remainder computed with the <c>%</c> operator.
    /// </summary>
    /// <returns>The field value modulo 256 (255 for the current field value).</returns>
    [Benchmark(Baseline = true, Description = "Standard Modulus using %")]
    public int StandardModulus()
    {
        return this.value % 256;
    }

    /// <summary>
    /// Remainder computed by masking with 255 (256 - 1); equivalent to modulo 256 for non-negative values.
    /// </summary>
    /// <returns>The field value masked with 255 (255 for the current field value).</returns>
    [Benchmark(Description = "Bitwise Modulus using &")]
    public int BitwiseModulus()
    {
        return this.value & 255;
    }
}
|||
} |
|||
@ -0,0 +1,113 @@ |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization |
|||
{ |
|||
/// <summary>
/// Benchmarks several SIMD ways of converting integer lanes, stored in place inside a
/// <see cref="float"/> buffer, into floats scaled down towards the [0..1] range.
/// Every benchmark reinterprets and overwrites the same buffer.
/// </summary>
[Config(typeof(Config.ShortClr))]
public class UInt32ToSingle
{
    private const int Count = 32;

    private float[] data;

    /// <summary>
    /// Allocates the working buffer (all zeros).
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        this.data = new float[Count];
    }

    /// <summary>
    /// Baseline: bit-level trick. Keeps the low 8 bits of each lane, ORs them into the bit
    /// pattern of 32768.0f, then subtracts 32768.0f and multiplies by 256/255.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void MagicMethod()
    {
        ref Vector<float> first = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);

        int vectorCount = Count / Vector<float>.Count;

        Vector<float> rescale = new Vector<float>(256.0f / 255.0f);
        Vector<float> magicFloat = new Vector<float>(32768.0f);
        Vector<uint> magicBits = new Vector<uint>(1191182336); // reinterpreted value of 32768.0f
        Vector<uint> lowByteMask = new Vector<uint>(255);

        for (int i = 0; i < vectorCount; i++)
        {
            // Reference C snippet for the float -> byte direction of this trick
            // (the loop body below appears to apply the inverse -- confirm):
            // union { float f; uint32_t i; } u;
            // u.f = 32768.0f + x * (255.0f / 256.0f);
            // return (uint8_t)u.i;
            ref Vector<float> lane = ref Unsafe.Add(ref first, i);

            Vector<uint> bits = (Vector.AsVectorUInt32(lane) & lowByteMask) | magicBits;

            Vector<float> result = Vector.AsVectorSingle(bits);
            result = (result - magicFloat) * rescale;

            lane = result;
        }
    }

    /// <summary>
    /// Reads the buffer as <see cref="uint"/> vectors, converts them with
    /// Vector.ConvertToSingle and multiplies by 1/255.
    /// </summary>
    [Benchmark]
    public void StandardSimd()
    {
        int vectorCount = Count / Vector<float>.Count;

        ref Vector<float> floats = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);
        ref Vector<uint> uints = ref Unsafe.As<Vector<float>, Vector<uint>>(ref floats);

        Vector<float> scale = new Vector<float>(1f / 255f);

        for (int i = 0; i < vectorCount; i++)
        {
            Vector<uint> raw = Unsafe.Add(ref uints, i);
            Vector<float> converted = Vector.ConvertToSingle(raw);
            converted *= scale;
            Unsafe.Add(ref floats, i) = converted;
        }
    }

    /// <summary>
    /// Same as <see cref="StandardSimd"/>, but reads the buffer as <see cref="int"/> vectors.
    /// </summary>
    [Benchmark]
    public void StandardSimdFromInt()
    {
        int vectorCount = Count / Vector<float>.Count;

        ref Vector<float> floats = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);
        ref Vector<int> ints = ref Unsafe.As<Vector<float>, Vector<int>>(ref floats);

        Vector<float> scale = new Vector<float>(1f / 255f);

        for (int i = 0; i < vectorCount; i++)
        {
            Vector<int> raw = Unsafe.Add(ref ints, i);
            Vector<float> converted = Vector.ConvertToSingle(raw);
            converted *= scale;
            Unsafe.Add(ref floats, i) = converted;
        }
    }

    /// <summary>
    /// Same as <see cref="StandardSimdFromInt"/>, but takes a reference to each float vector and
    /// reinterprets it with Vector.AsVectorInt32 instead of reading through an int-typed reference.
    /// </summary>
    [Benchmark]
    public void StandardSimdFromInt_RefCast()
    {
        int vectorCount = Count / Vector<float>.Count;

        ref Vector<float> floats = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);

        // Declared as in the sibling benchmarks; the loop below does not read through it.
        ref Vector<int> ints = ref Unsafe.As<Vector<float>, Vector<int>>(ref floats);

        Vector<float> scale = new Vector<float>(1f / 255f);

        for (int i = 0; i < vectorCount; i++)
        {
            ref Vector<float> lane = ref Unsafe.Add(ref floats, i);

            Vector<int> raw = Vector.AsVectorInt32(lane);
            Vector<float> converted = Vector.ConvertToSingle(raw);
            converted *= scale;

            lane = converted;
        }
    }
}
|||
} |
|||
@ -0,0 +1,64 @@ |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
|
|||
using SixLabors.ImageSharp.Tuples; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization |
|||
{ |
|||
/// <summary>
/// Benchmarks widening a buffer of <see cref="byte"/> values into <see cref="uint"/> values:
/// field-by-field copies through the Octet tuples versus Vector.Widen.
/// </summary>
[Config(typeof(Config.ShortClr))]
public class WidenBytesToUInt32
{
    private const int Count = 64;

    private byte[] source;

    private uint[] dest;

    /// <summary>
    /// Allocates the source and destination buffers, both with <see cref="Count"/> elements.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        this.source = new byte[Count];
        this.dest = new uint[Count];
    }

    /// <summary>
    /// Baseline: views both buffers as sequences of 8-element tuples and copies tuple by tuple
    /// with Octet.OfUInt32.LoadFrom.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void Standard()
    {
        const int OctetCount = Count / 8;

        ref Octet.OfByte srcOctets = ref Unsafe.As<byte, Octet.OfByte>(ref this.source[0]);
        ref Octet.OfUInt32 dstOctets = ref Unsafe.As<uint, Octet.OfUInt32>(ref this.dest[0]);

        for (int i = 0; i < OctetCount; i++)
        {
            ref Octet.OfByte from = ref Unsafe.Add(ref srcOctets, i);
            Unsafe.Add(ref dstOctets, i).LoadFrom(ref from);
        }
    }

    /// <summary>
    /// Widens each byte vector in two steps (byte to ushort, then ushort to uint) with Vector.Widen.
    /// </summary>
    [Benchmark]
    public void Simd()
    {
        int vectorCount = Count / Vector<byte>.Count;

        ref Vector<byte> srcVectors = ref Unsafe.As<byte, Vector<byte>>(ref this.source[0]);
        ref Vector<uint> dstVectors = ref Unsafe.As<uint, Vector<uint>>(ref this.dest[0]);

        for (int i = 0; i < vectorCount; i++)
        {
            Vector<byte> bytes = Unsafe.Add(ref srcVectors, i);

            Vector.Widen(bytes, out Vector<ushort> lowShorts, out Vector<ushort> highShorts);
            Vector.Widen(lowShorts, out Vector<uint> q0, out Vector<uint> q1);
            Vector.Widen(highShorts, out Vector<uint> q2, out Vector<uint> q3);

            // One byte vector produces four uint vectors, hence the stride of 4.
            ref Vector<uint> target = ref Unsafe.Add(ref dstVectors, i * 4);
            target = q0;
            Unsafe.Add(ref target, 1) = q1;
            Unsafe.Add(ref target, 2) = q2;
            Unsafe.Add(ref target, 3) = q3;
        }
    }
}
|||
} |
|||
File diff suppressed because it is too large
Loading…
Reference in new issue