Browse Source

Bulk conversion of arbitrary-sized Span-s of scalars

af/merge-core
Anton Firszov 8 years ago
parent
commit
81c57a812d
  1. 212
      src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
  2. 90
      src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
  3. 255
      src/ImageSharp/Common/Helpers/SimdUtils.cs
  4. 100
      src/ImageSharp/Common/Tuples/Octet.cs
  5. 2
      src/ImageSharp/Common/Tuples/Vector4Pair.cs
  6. 2
      src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
  7. 2
      src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
  8. 2
      src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
  9. 12
      src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
  10. 8
      tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
  11. 27
      tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs
  12. 7
      tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs
  13. 108
      tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

212
src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs

@ -0,0 +1,212 @@
// Copyright (c) Six Labors and contributors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Tuples;
// ReSharper disable MemberHidesStaticFromOuterClass
namespace SixLabors.ImageSharp
{
internal static partial class SimdUtils
{
/// <summary>
/// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*)
/// </summary>
public static class BasicIntrinsics256
{
public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture;
/// <summary>
/// <see cref="BulkConvertByteToNormalizedFloat"/> as much elements as possible, slicing them down (keeping the remainder).
/// </summary>
internal static void BulkConvertByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
if (IsAvailable)
{
int remainder = source.Length % 8;
int alignedCount = source.Length - remainder;
if (alignedCount > 0)
{
BulkConvertByteToNormalizedFloat(
source.Slice(0, alignedCount),
dest.Slice(0, alignedCount));
source = source.Slice(alignedCount);
dest = dest.Slice(alignedCount);
}
}
}
/// <summary>
/// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source'
/// into 'dest' buffer of <see cref="byte"/>. The values are scaled up into [0-255] and rounded.
/// The implementation is SIMD optimized and works only with `source.Length` divisible by 8/>.
/// Based on:
/// <see>
/// <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
/// </see>
/// </summary>
internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
{
GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
if (source.Length == 0)
{
return;
}
ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
int n = source.Length / 8;
Vector<float> magick = new Vector<float>(32768.0f);
Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
// need to copy to a temporary struct, because
// SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
// does not work. TODO: This might be a CoreClr bug, need to ask/report
var temp = default(Octet.OfUInt32);
ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
for (int i = 0; i < n; i++)
{
// union { float f; uint32_t i; } u;
// u.f = 32768.0f + x * (255.0f / 256.0f);
// return (uint8_t)u.i;
Vector<float> x = Unsafe.Add(ref srcBase, i);
x = (x * scale) + magick;
tempRef = x;
ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
d.LoadFrom(ref temp);
}
}
/// <summary>
/// SIMD optimized implementation for <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>.
/// Works only with `dest.Length` divisible by 8.
/// Implementation adapted from:
/// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
/// http://stackoverflow.com/a/536278
/// </summary>
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
DebugGuard.IsTrue((dest.Length % 8) == 0, nameof(source), "dest.Length should be divisable by 8!");
var bVec = new Vector<float>(256.0f / 255.0f);
var magicFloat = new Vector<float>(32768.0f);
var magicInt = new Vector<uint>(1191182336); // reinterpreded value of 32768.0f
var mask = new Vector<uint>(255);
ref Octet.OfByte sourceBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(source));
ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As<float, Octet.OfUInt32>(ref MemoryMarshal.GetReference(dest));
ref Vector<float> destBaseAsFloat = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref destBaseAsWideOctet);
int n = dest.Length / 8;
for (int i = 0; i < n; i++)
{
ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i);
ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i);
d.LoadFrom(ref s);
}
for (int i = 0; i < n; i++)
{
ref Vector<float> df = ref Unsafe.Add(ref destBaseAsFloat, i);
var vi = Vector.AsVectorUInt32(df);
vi &= mask;
vi |= magicInt;
var vf = Vector.AsVectorSingle(vi);
vf = (vf - magicFloat) * bVec;
df = vf;
}
}
/// <summary>
/// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as much elements as possible, slicing them down (keeping the remainder).
/// </summary>
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
ref ReadOnlySpan<float> source,
ref Span<byte> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
if (IsAvailable)
{
int remainder = source.Length % Vector<byte>.Count;
int alignedCount = source.Length - remainder;
if (alignedCount > 0)
{
BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
source = source.Slice(alignedCount);
dest = dest.Slice(alignedCount);
}
}
}
/// <summary>
/// Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values before conversion.
/// </summary>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
{
GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
DebugGuard.IsTrue((source.Length % 8) == 0, nameof(source), "source.Length should be divisible by 8!");
if (source.Length == 0)
{
return;
}
ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
int n = source.Length / 8;
Vector<float> magick = new Vector<float>(32768.0f);
Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
// need to copy to a temporary struct, because
// SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
// does not work. TODO: This might be a CoreClr bug, need to ask/report
var temp = default(Octet.OfUInt32);
ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
for (int i = 0; i < n; i++)
{
// union { float f; uint32_t i; } u;
// u.f = 32768.0f + x * (255.0f / 256.0f);
// return (uint8_t)u.i;
Vector<float> x = Unsafe.Add(ref srcBase, i);
x = Vector.Max(x, Vector<float>.Zero);
x = Vector.Min(x, Vector<float>.One);
x = (x * scale) + magick;
tempRef = x;
ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
d.LoadFrom(ref temp);
}
}
}
}
}

90
src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs

@ -1,8 +1,10 @@
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// ReSharper disable MemberHidesStaticFromOuterClass
namespace SixLabors.ImageSharp
{
internal static partial class SimdUtils
@ -18,22 +20,47 @@ namespace SixLabors.ImageSharp
{
public static bool IsAvailable { get; } =
#if NETCOREAPP2_1
// TODO: Also available in .NET 4.7.2, we need to add a build target!
true;
// TODO: Also available in .NET 4.7.2, we need to add a build target!
Vector.IsHardwareAccelerated;
#else
false;
#endif
/// <summary>
/// A variant of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>, which is faster on new .NET runtime.
/// <see cref="BulkConvertByteToNormalizedFloat"/> as much elements as possible, slicing them down (keeping the remainder).
/// </summary>
[Conditional("NETCOREAPP2_1")]
internal static void BulkConvertByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
if (IsAvailable)
{
int remainder = source.Length % Vector<byte>.Count;
int alignedCount = source.Length - remainder;
if (alignedCount > 0)
{
BulkConvertByteToNormalizedFloat(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
source = source.Slice(alignedCount);
dest = dest.Slice(alignedCount);
}
}
}
/// <summary>
/// A variant of <see cref="BasicIntrinsics256.BulkConvertByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// </summary>
// ReSharper disable once MemberHidesStaticFromOuterClass
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
Guard.IsTrue(
DebugGuard.IsTrue(
dest.Length % Vector<byte>.Count == 0,
nameof(source),
"dest.Length should be divisable by Vector<byte>.Count!");
"dest.Length should be divisible by Vector<byte>.Count!");
int n = dest.Length / Vector<byte>.Count;
@ -63,34 +90,52 @@ namespace SixLabors.ImageSharp
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<float> ConvertToSingle(Vector<uint> u, Vector<float> scale)
/// <summary>
/// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as much elements as possible, slicing them down (keeping the remainder).
/// </summary>
[Conditional("NETCOREAPP2_1")]
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
ref ReadOnlySpan<float> source,
ref Span<byte> dest)
{
Vector<int> vi = Vector.AsVectorInt32(u);
Vector<float> v = Vector.ConvertToSingle(vi);
v *= scale;
return v;
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
if (IsAvailable)
{
int remainder = source.Length % Vector<byte>.Count;
int alignedCount = source.Length - remainder;
if (alignedCount > 0)
{
BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
source = source.Slice(alignedCount);
dest = dest.Slice(alignedCount);
}
}
}
/// <summary>
/// A variant of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
/// A variant of <see cref="BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
/// </summary>
/// <remarks>
/// It does NOT worth yet to utilize this method (2018 Oct).
/// See benchmark results for the "PackFromVector4_Rgba32" benchmark!
/// TODO: Check again later!
/// </remarks>
// ReSharper disable once MemberHidesStaticFromOuterClass
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
internal static void BulkConvertNormalizedFloatToByteClampOverflows(
ReadOnlySpan<float> source,
Span<byte> dest)
{
Guard.IsTrue(
DebugGuard.IsTrue(
dest.Length % Vector<byte>.Count == 0,
nameof(dest),
"dest.Length should be divisable by Vector<byte>.Count!");
"dest.Length should be divisible by Vector<byte>.Count!");
int n = dest.Length / Vector<byte>.Count;
ref Vector<float> sourceBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
ref Vector<float> sourceBase =
ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
ref Vector<byte> destBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(dest));
for (int i = 0; i < n; i++)
@ -126,6 +171,15 @@ namespace SixLabors.ImageSharp
Vector<int> vi = Vector.ConvertToInt32(vf);
return Vector.AsVectorUInt32(vi);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<float> ConvertToSingle(Vector<uint> u, Vector<float> scale)
{
Vector<int> vi = Vector.AsVectorInt32(u);
Vector<float> v = Vector.ConvertToSingle(vi);
v *= scale;
return v;
}
}
}
}
}

255
src/ImageSharp/Common/Helpers/SimdUtils.cs

@ -6,6 +6,9 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Tuples;
namespace SixLabors.ImageSharp
{
/// <summary>
@ -16,7 +19,8 @@ namespace SixLabors.ImageSharp
/// <summary>
/// Gets a value indicating whether the code is being executed on AVX2 CPU where both float and integer registers are of size 256 byte.
/// </summary>
public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;
public static bool IsAvx2CompatibleArchitecture { get; } =
Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;
internal static void GuardAvx2(string operation)
{
@ -57,236 +61,61 @@ namespace SixLabors.ImageSharp
}
/// <summary>
/// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/> values.
/// The values are scaled up into [0-255] and rounded.
/// The implementation is SIMD optimized and works only with `source.Length` divisible by <see cref="Vector{UInt32}.Count"/>.
/// Based on:
/// <see>
/// <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
/// </see>
/// </summary>
internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
{
GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
if (source.Length == 0)
{
return;
}
ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
int n = source.Length / 8;
Vector<float> magick = new Vector<float>(32768.0f);
Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
// need to copy to a temporary struct, because
// SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
// does not work. TODO: This might be a CoreClr bug, need to ask/report
var temp = default(Octet.OfUInt32);
ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
for (int i = 0; i < n; i++)
{
// union { float f; uint32_t i; } u;
// u.f = 32768.0f + x * (255.0f / 256.0f);
// return (uint8_t)u.i;
Vector<float> x = Unsafe.Add(ref srcBase, i);
x = (x * scale) + magick;
tempRef = x;
ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
d.LoadFrom(ref temp);
}
}
/// <summary>
/// Converts `dest.Length` bytes to <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1]
/// The implementation is SIMD optimized and works only with `dest.Length` divisible by <see cref="Vector{UInt32}.Count"/>.
/// Implementation adapted from:
/// <see>
/// <cref>http://stackoverflow.com/a/5362789</cref>
/// </see>
/// Converts `dest.Length` <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
/// <paramref name="source"/> should be the of the same size as <paramref name="dest"/>,
/// but there are no restrictions on the span's length.
/// </summary>
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
DebugGuard.IsTrue((dest.Length % Vector<float>.Count) == 0, nameof(source), "dest.Length should be divisable by Vector<float>.Count!");
var bVec = new Vector<float>(256.0f / 255.0f);
var magicFloat = new Vector<float>(32768.0f);
var magicInt = new Vector<uint>(1191182336); // reinterpreded value of 32768.0f
var mask = new Vector<uint>(255);
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
ref Octet.OfByte sourceBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(source));
ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As<float, Octet.OfUInt32>(ref MemoryMarshal.GetReference(dest));
ref Vector<float> destBaseAsFloat = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref destBaseAsWideOctet);
int n = dest.Length / 8;
for (int i = 0; i < n; i++)
{
ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i);
ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i);
d.LoadFrom(ref s);
}
ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
for (int i = 0; i < n; i++)
// Deal with the remainder:
int count = source.Length;
if (count > 0)
{
ref Vector<float> df = ref Unsafe.Add(ref destBaseAsFloat, i);
var vi = Vector.AsVectorUInt32(df);
vi &= mask;
vi |= magicInt;
var vf = Vector.AsVectorSingle(vi);
vf = (vf - magicFloat) * bVec;
df = vf;
// TODO: Do we need to optimize anything on this? (There are at most 7 remainders)
ref byte sBase = ref MemoryMarshal.GetReference(source);
ref float dBase = ref MemoryMarshal.GetReference(dest);
for (int i = 0; i < count; i++)
{
Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f;
}
}
}
/// <summary>
/// Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values before conversion.
/// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
/// The values are scaled up into [0-255] and rounded, overflows are clamped.
/// <paramref name="source"/> should be the of the same size as <paramref name="dest"/>,
/// but there are no restrictions on the span's length.
/// </summary>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
{
GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
if (source.Length == 0)
{
return;
}
ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
int n = source.Length / 8;
Vector<float> magick = new Vector<float>(32768.0f);
Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
// need to copy to a temporary struct, because
// SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
// does not work. TODO: This might be a CoreClr bug, need to ask/report
var temp = default(Octet.OfUInt32);
ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
for (int i = 0; i < n; i++)
{
// union { float f; uint32_t i; } u;
// u.f = 32768.0f + x * (255.0f / 256.0f);
// return (uint8_t)u.i;
Vector<float> x = Unsafe.Add(ref srcBase, i);
x = Vector.Max(x, Vector<float>.Zero);
x = Vector.Min(x, Vector<float>.One);
x = (x * scale) + magick;
tempRef = x;
ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
d.LoadFrom(ref temp);
}
}
// TODO: Replace these with T4-d library level tuples!
internal static class Octet
{
[StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
public struct OfUInt32
{
[FieldOffset(0 * sizeof(uint))]
public uint V0;
[FieldOffset(1 * sizeof(uint))]
public uint V1;
[FieldOffset(2 * sizeof(uint))]
public uint V2;
[FieldOffset(3 * sizeof(uint))]
public uint V3;
[FieldOffset(4 * sizeof(uint))]
public uint V4;
[FieldOffset(5 * sizeof(uint))]
public uint V5;
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
[FieldOffset(6 * sizeof(uint))]
public uint V6;
ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
[FieldOffset(7 * sizeof(uint))]
public uint V7;
public override string ToString()
{
return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
}
[MethodImpl(InliningOptions.ShortMethod)]
public void LoadFrom(ref OfByte src)
{
this.V0 = src.V0;
this.V1 = src.V1;
this.V2 = src.V2;
this.V3 = src.V3;
this.V4 = src.V4;
this.V5 = src.V5;
this.V6 = src.V6;
this.V7 = src.V7;
}
}
[StructLayout(LayoutKind.Explicit, Size = 8)]
public struct OfByte
// Deal with the remainder:
int count = source.Length;
if (count > 0)
{
[FieldOffset(0)]
public byte V0;
[FieldOffset(1)]
public byte V1;
[FieldOffset(2)]
public byte V2;
[FieldOffset(3)]
public byte V3;
[FieldOffset(4)]
public byte V4;
[FieldOffset(5)]
public byte V5;
[FieldOffset(6)]
public byte V6;
[FieldOffset(7)]
public byte V7;
public override string ToString()
{
return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
}
ref float sBase = ref MemoryMarshal.GetReference(source);
ref byte dBase = ref MemoryMarshal.GetReference(dest);
[MethodImpl(InliningOptions.ShortMethod)]
public void LoadFrom(ref OfUInt32 src)
for (int i = 0; i < count; i++)
{
this.V0 = (byte)src.V0;
this.V1 = (byte)src.V1;
this.V2 = (byte)src.V2;
this.V3 = (byte)src.V3;
this.V4 = (byte)src.V4;
this.V5 = (byte)src.V5;
this.V6 = (byte)src.V6;
this.V7 = (byte)src.V7;
// TODO: Do we need to optimize anything on this? (There are at most 7 remainders)
float f = Unsafe.Add(ref sBase, i);
f *= 255f;
f += 0.5f;
f = MathF.Max(0, f);
f = MathF.Min(255f, f);
Unsafe.Add(ref dBase, i) = (byte)f;
}
}
}

100
src/ImageSharp/Common/Tuples/Octet.cs

@ -0,0 +1,100 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace SixLabors.ImageSharp.Tuples
{
internal static class Octet
{
[StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
public struct OfUInt32
{
[FieldOffset(0 * sizeof(uint))]
public uint V0;
[FieldOffset(1 * sizeof(uint))]
public uint V1;
[FieldOffset(2 * sizeof(uint))]
public uint V2;
[FieldOffset(3 * sizeof(uint))]
public uint V3;
[FieldOffset(4 * sizeof(uint))]
public uint V4;
[FieldOffset(5 * sizeof(uint))]
public uint V5;
[FieldOffset(6 * sizeof(uint))]
public uint V6;
[FieldOffset(7 * sizeof(uint))]
public uint V7;
public override string ToString()
{
return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
}
[MethodImpl(InliningOptions.ShortMethod)]
public void LoadFrom(ref OfByte src)
{
this.V0 = src.V0;
this.V1 = src.V1;
this.V2 = src.V2;
this.V3 = src.V3;
this.V4 = src.V4;
this.V5 = src.V5;
this.V6 = src.V6;
this.V7 = src.V7;
}
}
[StructLayout(LayoutKind.Explicit, Size = 8)]
public struct OfByte
{
[FieldOffset(0)]
public byte V0;
[FieldOffset(1)]
public byte V1;
[FieldOffset(2)]
public byte V2;
[FieldOffset(3)]
public byte V3;
[FieldOffset(4)]
public byte V4;
[FieldOffset(5)]
public byte V5;
[FieldOffset(6)]
public byte V6;
[FieldOffset(7)]
public byte V7;
public override string ToString()
{
return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
}
[MethodImpl(InliningOptions.ShortMethod)]
public void LoadFrom(ref OfUInt32 src)
{
this.V0 = (byte)src.V0;
this.V1 = (byte)src.V1;
this.V2 = (byte)src.V2;
this.V3 = (byte)src.V3;
this.V4 = (byte)src.V4;
this.V5 = (byte)src.V5;
this.V6 = (byte)src.V6;
this.V7 = (byte)src.V7;
}
}
}
}

2
src/ImageSharp/Common/Tuples/Vector4Pair.cs

@ -2,7 +2,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace SixLabors.ImageSharp.Common.Tuples
namespace SixLabors.ImageSharp.Tuples
{
/// <summary>
/// Its faster to process multiple Vector4-s together, so let's pair them!

2
src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs

@ -6,7 +6,7 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Common.Tuples;
using SixLabors.ImageSharp.Tuples;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{

2
src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs

@ -6,7 +6,7 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Common.Tuples;
using SixLabors.ImageSharp.Tuples;
// ReSharper disable ImpureMethodCallOnReadonlyValueField
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters

2
src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs

@ -6,8 +6,8 @@ using System.Collections.Generic;
using System.Linq;
using System.Numerics;
using SixLabors.ImageSharp.Common.Tuples;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.Tuples;
using SixLabors.Memory;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters

12
src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs

@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.PixelFormats
}
else
{
ConvertToVector4UsingStandardIntrinsics(sourceColors, destinationVectors, count);
ConvertToVector4UsingBasicIntrinsics(sourceColors, destinationVectors, count);
}
}
@ -58,7 +58,7 @@ namespace SixLabors.ImageSharp.PixelFormats
}
else
{
ConvertFromVector4StandardIntrinsics(sourceVectors, destinationColors, count);
ConvertFromVector4BasicIntrinsics(sourceVectors, destinationColors, count);
}
}
@ -112,7 +112,7 @@ namespace SixLabors.ImageSharp.PixelFormats
}
}
private static void ConvertToVector4UsingStandardIntrinsics(
private static void ConvertToVector4UsingBasicIntrinsics(
ReadOnlySpan<Rgba32> sourceColors,
Span<Vector4> destinationVectors,
int count)
@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.PixelFormats
ReadOnlySpan<byte> rawSrc = MemoryMarshal.Cast<Rgba32, byte>(sourceColors);
Span<float> rawDest = MemoryMarshal.Cast<Vector4, float>(destinationVectors.Slice(0, alignedCount));
SimdUtils.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
}
if (remainder > 0)
@ -155,7 +155,7 @@ namespace SixLabors.ImageSharp.PixelFormats
}
}
private static void ConvertFromVector4StandardIntrinsics(ReadOnlySpan<Vector4> sourceVectors, Span<Rgba32> destinationColors, int count)
private static void ConvertFromVector4BasicIntrinsics(ReadOnlySpan<Vector4> sourceVectors, Span<Rgba32> destinationColors, int count)
{
int remainder = count % 2;
int alignedCount = count - remainder;
@ -165,7 +165,7 @@ namespace SixLabors.ImageSharp.PixelFormats
ReadOnlySpan<float> rawSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
Span<byte> rawDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors);
SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
}
if (remainder > 0)

8
tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs

@ -30,8 +30,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
[Params(
//64,
//256,
//512,
2048
512
//1024
)]
public int Count { get; set; }
@ -117,7 +117,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
}
//[Benchmark]
[Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
@ -159,7 +159,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
}
}
//[Benchmark]
[Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());

27
tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs

@ -5,6 +5,7 @@ using BenchmarkDotNet.Attributes;
namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
{
[Config(typeof(Config.ShortClr))]
public class UInt32ToSingle
{
private float[] data;
@ -66,8 +67,7 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
Unsafe.Add(ref bf, i) = v;
}
}
// This code is not correct at all, it's just here as reference
[Benchmark]
public void StandardSimdFromInt()
{
@ -86,5 +86,28 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
Unsafe.Add(ref bf, i) = v;
}
}
[Benchmark]
public void StandardSimdFromInt_RefCast()
{
int n = Count / Vector<float>.Count;
ref Vector<float> bf = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);
ref Vector<int> bu = ref Unsafe.As<Vector<float>, Vector<int>>(ref bf);
var scale = new Vector<float>(1f / 255f);
for (int i = 0; i < n; i++)
{
ref Vector<float> fRef = ref Unsafe.Add(ref bf, i);
Vector<int> du = Vector.AsVectorInt32(fRef);
Vector<float> v = Vector.ConvertToSingle(du);
v *= scale;
fRef = v;
}
}
}
}

7
tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs

@ -3,8 +3,11 @@ using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;
using SixLabors.ImageSharp.Tuples;
namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
{
[Config(typeof(Config.ShortClr))]
public class WidenBytesToUInt32
{
private byte[] source;
@ -25,8 +28,8 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
{
const int N = Count / 8;
ref SimdUtils.Octet.OfByte sBase = ref Unsafe.As<byte, SimdUtils.Octet.OfByte>(ref this.source[0]);
ref SimdUtils.Octet.OfUInt32 dBase = ref Unsafe.As<uint, SimdUtils.Octet.OfUInt32>(ref this.dest[0]);
ref Octet.OfByte sBase = ref Unsafe.As<byte, Octet.OfByte>(ref this.source[0]);
ref Octet.OfUInt32 dBase = ref Unsafe.As<uint, Octet.OfUInt32>(ref this.dest[0]);
for (int i = 0; i < N; i++)
{

108
tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

@ -62,7 +62,7 @@ namespace SixLabors.ImageSharp.Tests.Common
{
float[] data = new float[Vector<float>.Count];
var rnd = new Random();
var rnd = new Random(seed);
for (int i = 0; i < Vector<float>.Count; i++)
{
@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common
[InlineData(1, 8)]
[InlineData(2, 16)]
[InlineData(3, 128)]
public void BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
{
if (this.SkipOnNonAvx2())
{
@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Tests.Common
byte[] dest = new byte[count];
SimdUtils.BulkConvertNormalizedFloatToByte(normalized, dest);
SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(normalized, dest);
byte[] expected = orig.Select(f => (byte)(f)).ToArray();
@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common
[InlineData(1, 8)]
[InlineData(2, 16)]
[InlineData(3, 128)]
public void BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
{
if (this.SkipOnNonAvx2())
{
@ -153,87 +153,113 @@ namespace SixLabors.ImageSharp.Tests.Common
byte[] dest = new byte[count];
SimdUtils.BulkConvertNormalizedFloatToByte(source, dest);
SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(source, dest);
byte[] expected = source.Select(f => (byte)Math.Round(f * 255f)).ToArray();
Assert.Equal(expected, dest);
}
public static readonly TheoryData<int> ArraySizesDivisibleBy8 = new TheoryData<int> { 0, 8, 16, 1024 };
public static readonly TheoryData<int> ArraySizesDivisibleBy32 = new TheoryData<int> { 0, 32, 512 };
public static readonly TheoryData<int> ArbitraryArraySizes =
new TheoryData<int>
{
0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 520,
};
[Theory]
[InlineData(1, 0)]
[InlineData(2, 32)]
[InlineData(3, 128)]
public void BulkConvertByteToNormalizedFloat(int seed, int count)
[MemberData(nameof(ArraySizesDivisibleBy8))]
public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count)
{
if (this.SkipOnNonAvx2())
{
return;
}
byte[] source = new Random(seed).GenerateRandomByteArray(count);
float[] result = new float[count];
float[] expected = source.Select(b => (float)b / 255f).ToArray();
SimdUtils.BulkConvertByteToNormalizedFloat(source, result);
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
TestImpl_BulkConvertByteToNormalizedFloat(
count,
(s, d) => SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(s.Span, d.Span));
}
[Theory]
[InlineData(1, 0)]
[InlineData(2, 32)]
[InlineData(3, 128)]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count)
[MemberData(nameof(ArraySizesDivisibleBy32))]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int count)
{
TestImpl_BulkConvertByteToNormalizedFloat(
count,
(s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(s.Span, d.Span));
}
[Theory]
[MemberData(nameof(ArbitraryArraySizes))]
public void BulkConvertByteToNormalizedFloat(int count)
{
TestImpl_BulkConvertByteToNormalizedFloat(
count,
(s, d) => SimdUtils.BulkConvertByteToNormalizedFloat(s.Span, d.Span));
}
private static void TestImpl_BulkConvertByteToNormalizedFloat(
int count,
Action<Memory<byte>, Memory<float>> convert)
{
byte[] source = new Random(seed).GenerateRandomByteArray(count);
byte[] source = new Random(count).GenerateRandomByteArray(count);
float[] result = new float[count];
float[] expected = source.Select(b => (float)b / 255f).ToArray();
SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result);
convert(source, result);
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
}
public static readonly TheoryData<int> BulkConvertNormalizedFloatToByteClampOverflows_Data =
new TheoryData<int>
{
0, 64, 1024
};
[Theory]
[MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))]
public void BulkConvertNormalizedFloatToByteClampOverflows(int count)
[MemberData(nameof(ArraySizesDivisibleBy8))]
public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
if (this.SkipOnNonAvx2())
{
return;
}
float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f);
byte[] expected = source.Select(NormalizedFloatToByte).ToArray();
byte[] actual = new byte[count];
SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(source, actual);
Assert.Equal(expected, actual);
TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count,
(s, d) => SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)
);
}
[Theory]
[MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))]
[MemberData(nameof(ArraySizesDivisibleBy32))]
public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count,
(s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)
);
}
[Theory]
[MemberData(nameof(ArbitraryArraySizes))]
public void BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count,
(s, d) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)
);
}
private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
int count,
Action<Memory<float>, Memory<byte>> convert)
{
float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f);
byte[] expected = source.Select(NormalizedFloatToByte).ToArray();
byte[] actual = new byte[count];
SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(source, actual);
convert(source, actual);
Assert.Equal(expected, actual);
}
private static byte NormalizedFloatToByte(float f) => (byte)Math.Min(255f, Math.Max(0f, f * 255f + 0.5f));
[Theory]

Loading…
Cancel
Save