mirror of https://github.com/SixLabors/ImageSharp
13 changed files with 537 additions and 290 deletions
@ -0,0 +1,212 @@ |
|||||
|
// Copyright (c) Six Labors and contributors.
|
||||
|
// Licensed under the Apache License, Version 2.0.
|
||||
|
|
||||
|
using System; |
||||
|
using System.Diagnostics; |
||||
|
using System.Numerics; |
||||
|
using System.Runtime.CompilerServices; |
||||
|
using System.Runtime.InteropServices; |
||||
|
using SixLabors.ImageSharp.Tuples; |
||||
|
|
||||
|
// ReSharper disable MemberHidesStaticFromOuterClass
|
||||
|
namespace SixLabors.ImageSharp |
||||
|
{ |
||||
|
internal static partial class SimdUtils |
||||
|
{ |
||||
|
/// <summary>
|
||||
|
/// 256-bit / AVX2 intrinsics NOT depending on newer APIs (Vector.Widen, Vector.Narrow, Vector.ConvertTo*)
|
||||
|
/// </summary>
|
||||
|
public static class BasicIntrinsics256 |
||||
|
{ |
||||
|
/// <summary>
/// Gets a value indicating whether the conversions in this class are available.
/// The value is taken once from <c>IsAvx2CompatibleArchitecture</c>; the <c>*Reduce</c> methods
/// of this class leave their arguments untouched when it is <see langword="false"/>.
/// </summary>
public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture;
||||
|
|
||||
|
/// <summary>
/// Converts as many leading elements as possible with <see cref="BulkConvertByteToNormalizedFloat"/>,
/// then slices both spans down so that only the unconverted remainder is left in them.
/// </summary>
/// <param name="source">The source bytes. On return it is sliced to the elements that were not converted.</param>
/// <param name="dest">The destination floats. On return it is sliced to the elements that were not written.</param>
internal static void BulkConvertByteToNormalizedFloatReduce(
    ref ReadOnlySpan<byte> source,
    ref Span<float> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");

    if (!IsAvailable)
    {
        // Nothing is converted; both spans are handed back unchanged.
        return;
    }

    // Length of the longest prefix that is a multiple of 8 elements.
    int simdLength = source.Length - (source.Length % 8);

    if (simdLength <= 0)
    {
        return;
    }

    BulkConvertByteToNormalizedFloat(source.Slice(0, simdLength), dest.Slice(0, simdLength));

    // Leave only the tail (fewer than 8 elements) to the caller.
    source = source.Slice(simdLength);
    dest = dest.Slice(simdLength);
}
||||
|
|
||||
|
/// <summary>
/// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source'
/// into 'dest' buffer of <see cref="byte"/>. The values are scaled up into [0-255] and rounded.
/// The implementation is SIMD optimized and works only with `source.Length` divisible by 8.
/// Input values are not clamped by this method.
/// Based on:
/// <see href="http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions"/>
/// </summary>
/// <param name="source">The normalized source values.</param>
/// <param name="dest">The destination buffer. It must be able to hold at least 'source.Length' bytes.</param>
internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
{
    GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));

    DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");

    // The loop below writes through unchecked references, so a too short 'dest' would be overrun.
    DebugGuard.IsTrue(dest.Length >= source.Length, nameof(dest), "dest.Length should be at least source.Length!");

    if (source.Length == 0)
    {
        return;
    }

    // View the source as whole vectors and the destination as groups of 8 bytes.
    ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
    ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
    int n = source.Length / 8;

    Vector<float> magick = new Vector<float>(32768.0f);
    Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);

    // need to copy to a temporary struct, because
    // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
    // does not work. TODO: This might be a CoreClr bug, need to ask/report
    var temp = default(Octet.OfUInt32);
    ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);

    for (int i = 0; i < n; i++)
    {
        // union { float f; uint32_t i; } u;
        // u.f = 32768.0f + x * (255.0f / 256.0f);
        // return (uint8_t)u.i;
        Vector<float> x = Unsafe.Add(ref srcBase, i);
        x = (x * scale) + magick;

        // Storing through 'tempRef' exposes the raw bits of 'x' as the 8 uint fields of 'temp'.
        tempRef = x;

        // Narrow each uint to a byte and write the 8 results to the destination.
        ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
        d.LoadFrom(ref temp);
    }
}
||||
|
|
||||
|
/// <summary>
/// SIMD optimized implementation for <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>.
/// Works only with `dest.Length` divisible by 8.
/// Implementation adapted from:
/// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
/// http://stackoverflow.com/a/536278
/// </summary>
/// <param name="source">The source bytes. At least 'dest.Length' of them are read.</param>
/// <param name="dest">The destination buffer receiving one float per converted byte.</param>
internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
    GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));

    // The condition is about 'dest', so 'dest' is the parameter that gets reported.
    DebugGuard.IsTrue((dest.Length % 8) == 0, nameof(dest), "dest.Length should be divisable by 8!");

    // The first loop reads 'dest.Length' bytes through unchecked references.
    DebugGuard.IsTrue(source.Length >= dest.Length, nameof(source), "source.Length should be at least dest.Length!");

    var bVec = new Vector<float>(256.0f / 255.0f);
    var magicFloat = new Vector<float>(32768.0f);
    var magicInt = new Vector<uint>(1191182336); // reinterpreted value of 32768.0f
    var mask = new Vector<uint>(255);

    ref Octet.OfByte sourceBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(source));
    ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As<float, Octet.OfUInt32>(ref MemoryMarshal.GetReference(dest));

    // Same memory as 'destBaseAsWideOctet', viewed as vectors of float.
    ref Vector<float> destBaseAsFloat = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref destBaseAsWideOctet);

    int n = dest.Length / 8;

    // Pass 1: widen every source byte into a 32 bit unsigned integer stored in the destination buffer.
    for (int i = 0; i < n; i++)
    {
        ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i);
        ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i);
        d.LoadFrom(ref s);
    }

    // Pass 2: turn the integers into normalized floats in place.
    for (int i = 0; i < n; i++)
    {
        ref Vector<float> df = ref Unsafe.Add(ref destBaseAsFloat, i);

        // Keep the low 8 bits and OR in the bit pattern of 32768.0f.
        var vi = Vector.AsVectorUInt32(df);
        vi &= mask;
        vi |= magicInt;

        // Reinterpret as float, remove the 32768.0f offset and rescale by 256 / 255.
        var vf = Vector.AsVectorSingle(vi);
        vf = (vf - magicFloat) * bVec;

        df = vf;
    }
}
||||
|
|
||||
|
/// <summary>
/// Converts as many leading elements as possible with
/// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/>, then slices both spans down
/// so that only the unconverted remainder is left in them.
/// </summary>
/// <param name="source">The normalized source values. On return it is sliced to the elements that were not converted.</param>
/// <param name="dest">The destination bytes. On return it is sliced to the elements that were not written.</param>
internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
    ref ReadOnlySpan<float> source,
    ref Span<byte> dest)
{
    DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");

    if (IsAvailable)
    {
        // BulkConvertNormalizedFloatToByteClampOverflows only requires a length divisible by 8
        // and handles 8 values per iteration, so 8 (not Vector<byte>.Count) is the block size.
        // This leaves fewer than 8 elements, instead of fewer than Vector<byte>.Count, unconverted.
        int remainder = source.Length % 8;
        int alignedCount = source.Length - remainder;

        if (alignedCount > 0)
        {
            BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));

            source = source.Slice(alignedCount);
            dest = dest.Slice(alignedCount);
        }
    }
}
||||
|
|
||||
|
/// <summary>
/// Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values before conversion.
/// Every input value is limited to [0..1] first, then scaled and converted to a <see cref="byte"/>.
/// Works only with `source.Length` divisible by 8.
/// </summary>
/// <param name="source">The source values.</param>
/// <param name="dest">The destination buffer. It must be able to hold at least 'source.Length' bytes.</param>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
{
    GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));

    DebugGuard.IsTrue((source.Length % 8) == 0, nameof(source), "source.Length should be divisible by 8!");

    // The loop below writes through unchecked references, so a too short 'dest' would be overrun.
    DebugGuard.IsTrue(dest.Length >= source.Length, nameof(dest), "dest.Length should be at least source.Length!");

    if (source.Length == 0)
    {
        return;
    }

    // View the source as whole vectors and the destination as groups of 8 bytes.
    ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
    ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
    int n = source.Length / 8;

    Vector<float> magick = new Vector<float>(32768.0f);
    Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);

    // need to copy to a temporary struct, because
    // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
    // does not work. TODO: This might be a CoreClr bug, need to ask/report
    var temp = default(Octet.OfUInt32);
    ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);

    for (int i = 0; i < n; i++)
    {
        // union { float f; uint32_t i; } u;
        // u.f = 32768.0f + x * (255.0f / 256.0f);
        // return (uint8_t)u.i;
        Vector<float> x = Unsafe.Add(ref srcBase, i);

        // Clamp to [0..1] before the conversion.
        x = Vector.Max(x, Vector<float>.Zero);
        x = Vector.Min(x, Vector<float>.One);

        x = (x * scale) + magick;

        // Storing through 'tempRef' exposes the raw bits of 'x' as the 8 uint fields of 'temp'.
        tempRef = x;

        // Narrow each uint to a byte and write the 8 results to the destination.
        ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
        d.LoadFrom(ref temp);
    }
}
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,100 @@ |
|||||
|
using System.Runtime.CompilerServices; |
||||
|
using System.Runtime.InteropServices; |
||||
|
|
||||
|
namespace SixLabors.ImageSharp.Tuples |
||||
|
{ |
||||
|
internal static class Octet |
||||
|
{ |
||||
|
/// <summary>
/// Eight <see cref="uint"/> values laid out back to back, with an explicit size of 8 * sizeof(uint) bytes.
/// </summary>
[StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
public struct OfUInt32
{
    // Each field sits at (index * sizeof(uint)) bytes from the start of the struct.
    [FieldOffset(0 * sizeof(uint))]
    public uint V0;

    [FieldOffset(1 * sizeof(uint))]
    public uint V1;

    [FieldOffset(2 * sizeof(uint))]
    public uint V2;

    [FieldOffset(3 * sizeof(uint))]
    public uint V3;

    [FieldOffset(4 * sizeof(uint))]
    public uint V4;

    [FieldOffset(5 * sizeof(uint))]
    public uint V5;

    [FieldOffset(6 * sizeof(uint))]
    public uint V6;

    [FieldOffset(7 * sizeof(uint))]
    public uint V7;

    /// <summary>
    /// Formats the eight values as a bracketed, comma separated list.
    /// </summary>
    /// <returns>The formatted text.</returns>
    public override string ToString()
        => $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";

    /// <summary>
    /// Overwrites all eight values with the corresponding bytes of <paramref name="src"/>,
    /// widening each byte to a <see cref="uint"/>.
    /// </summary>
    /// <param name="src">The bytes to load.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void LoadFrom(ref OfByte src)
    {
        this.V0 = src.V0;
        this.V1 = src.V1;
        this.V2 = src.V2;
        this.V3 = src.V3;
        this.V4 = src.V4;
        this.V5 = src.V5;
        this.V6 = src.V6;
        this.V7 = src.V7;
    }
}
||||
|
|
||||
|
/// <summary>
/// Eight <see cref="byte"/> values laid out back to back, with an explicit size of 8 bytes.
/// </summary>
[StructLayout(LayoutKind.Explicit, Size = 8)]
public struct OfByte
{
    // Each field sits at its own index, in bytes, from the start of the struct.
    [FieldOffset(0)]
    public byte V0;

    [FieldOffset(1)]
    public byte V1;

    [FieldOffset(2)]
    public byte V2;

    [FieldOffset(3)]
    public byte V3;

    [FieldOffset(4)]
    public byte V4;

    [FieldOffset(5)]
    public byte V5;

    [FieldOffset(6)]
    public byte V6;

    [FieldOffset(7)]
    public byte V7;

    /// <summary>
    /// Formats the eight values as a bracketed, comma separated list.
    /// </summary>
    /// <returns>The formatted text.</returns>
    public override string ToString()
        => $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";

    /// <summary>
    /// Overwrites all eight bytes with the corresponding values of <paramref name="src"/>,
    /// narrowing each one with a <see cref="byte"/> cast.
    /// </summary>
    /// <param name="src">The values to load.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void LoadFrom(ref OfUInt32 src)
    {
        // NOTE(review): in the default unchecked context the casts keep only the low 8 bits;
        // confirm the project is not compiled with overflow checking enabled.
        this.V0 = (byte)src.V0;
        this.V1 = (byte)src.V1;
        this.V2 = (byte)src.V2;
        this.V3 = (byte)src.V3;
        this.V4 = (byte)src.V4;
        this.V5 = (byte)src.V5;
        this.V6 = (byte)src.V6;
        this.V7 = (byte)src.V7;
    }
}
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue