mirror of https://github.com/SixLabors/ImageSharp
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
232 lines
8.6 KiB
232 lines
8.6 KiB
// Copyright (c) Six Labors and contributors.
|
|
// Licensed under the Apache License, Version 2.0.
|
|
|
|
using System;
|
|
using System.Diagnostics;
|
|
using System.Numerics;
|
|
using System.Runtime.CompilerServices;
|
|
using System.Runtime.InteropServices;
|
|
|
|
namespace SixLabors.ImageSharp
|
|
{
|
|
/// <summary>
|
|
/// Various extension and utility methods for <see cref="Vector4"/> and <see cref="Vector{T}"/> utilizing SIMD capabilities
|
|
/// </summary>
|
|
internal static class SimdUtils
|
|
{
|
|
/// <summary>
/// Indicates an AVX2 compatible architecture: both the <see cref="float"/> and the <see cref="int"/>
/// flavour of <see cref="Vector{T}"/> hold 8 elements, which means registers of 256 bits (32 bytes).
/// </summary>
public static readonly bool IsAvx2CompatibleArchitecture = Vector<float>.Count == 8 && Vector<int>.Count == 8;
|
|
|
|
/// <summary>
/// Throws a <see cref="NotSupportedException"/> when <see cref="IsAvx2CompatibleArchitecture"/> is false.
/// </summary>
/// <param name="operation">The name of the guarded operation; it is used as the prefix of the exception message.</param>
/// <exception cref="NotSupportedException">The current architecture is not AVX2 compatible.</exception>
internal static void GuardAvx2(string operation)
{
    if (IsAvx2CompatibleArchitecture)
    {
        return;
    }

    string message = $"{operation} is supported only on AVX2 CPU!";
    throw new NotSupportedException(message);
}
|
|
|
|
/// <summary>
/// Offsets all scalars in 'v' in a way that converting them to <see cref="int"/> afterwards approximates rounding.
/// Every scalar is moved away from zero by half of its own value clamped into [-1..1]:
/// the offset is a full 0.5 when the magnitude is at least 1, and only half of the scalar itself for smaller magnitudes.
/// </summary>
/// <param name="v">The vector to offset.</param>
/// <returns>The offset vector.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector4 PseudoRound(this Vector4 v)
{
    Vector4 lowerBound = new Vector4(-1);
    Vector4 upperBound = new Vector4(1);

    // For |v| >= 1 the clamped value is exactly the sign (-1 or 1), so the offset becomes -0.5 or 0.5.
    Vector4 offset = Vector4.Clamp(v, lowerBound, upperBound) * 0.5f;

    return v + offset;
}
|
|
|
|
/// <summary>
/// Rounds all values in 'x' to the nearest integer following <see cref="MidpointRounding.ToEven"/> semantics.
/// Works by adding and then subtracting 2^23 (8388608) carrying the sign of the rounded value.
/// Source:
/// <see>
///     <cref>https://github.com/g-truc/glm/blob/master/glm/simd/common.h#L110</cref>
/// </see>
/// </summary>
/// <param name="x">The vector of values to round.</param>
/// <returns>The vector of rounded values.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector<float> FastRound(this Vector<float> x)
{
    // Sign bit mask (0x80000000) reinterpreted as float elements.
    Vector<float> signMask = Vector.AsVectorSingle(new Vector<int>(int.MinValue));

    // 2^23 with the sign bit of the corresponding element of 'x' copied onto it.
    Vector<float> signedMagic = Vector.BitwiseOr(Vector.BitwiseAnd(signMask, x), new Vector<float>(8388608.0f));

    // NOTE(review): presumably only valid for magnitudes below 2^23 - confirm before using on larger inputs.
    return (x + signedMagic) - signedMagic;
}
|
|
|
|
/// <summary>
/// Converts 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into the 'dest' buffer of <see cref="byte"/> values.
/// The values are scaled up into [0-255] and rounded. No clamping is applied to the input values.
/// Based on:
/// <see>
///     <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
/// </see>
/// </summary>
/// <param name="source">The source values. The length should be divisible by Vector&lt;float&gt;.Count (verified by a debug guard only).</param>
/// <param name="dest">The destination buffer. It has to be at least as long as 'source'.</param>
/// <exception cref="NotSupportedException">The current architecture is not AVX2 compatible.</exception>
/// <exception cref="ArgumentException">'dest' is shorter than 'source'.</exception>
internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
{
    GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));

    DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");

    // The loop below writes through unchecked references, so a destination that is too short
    // would be overrun. Reject it before touching any memory.
    if (dest.Length < source.Length)
    {
        throw new ArgumentException("dest.Length should be at least source.Length!", nameof(dest));
    }

    if (source.Length == 0)
    {
        return;
    }

    ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref source.DangerousGetPinnableReference());
    ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref dest.DangerousGetPinnableReference());

    // GuardAvx2 has verified that a Vector<float> holds exactly 8 elements.
    int n = source.Length / 8;

    Vector<float> magick = new Vector<float>(32768.0f);
    Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);

    // need to copy to a temporary struct, because
    // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
    // does not work. TODO: This might be a CoreClr bug, need to ask/report
    var temp = default(Octet.OfUInt32);
    ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);

    for (int i = 0; i < n; i++)
    {
        // union { float f; uint32_t i; } u;
        // u.f = 32768.0f + x * (255.0f / 256.0f);
        // return (uint8_t)u.i;
        Vector<float> x = Unsafe.Add(ref srcBase, i);
        x = (x * scale) + magick;

        // Storing through 'tempRef' fills 'temp' with the raw bits of the 8 floats.
        tempRef = x;

        // The lowest byte of each 32 bit pattern is the converted value.
        ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
        d.LoadFrom(ref temp);
    }
}
|
|
|
|
/// <summary>
/// Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values into [0..1] before conversion.
/// </summary>
/// <param name="source">The source values. The length should be divisible by Vector&lt;float&gt;.Count (verified by a debug guard only).</param>
/// <param name="dest">The destination buffer. It has to be at least as long as 'source'.</param>
/// <exception cref="NotSupportedException">The current architecture is not AVX2 compatible.</exception>
/// <exception cref="ArgumentException">'dest' is shorter than 'source'.</exception>
internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
{
    // Report this method's own name in the exception message.
    GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));

    DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");

    // The loop below writes through unchecked references, so a destination that is too short
    // would be overrun. Reject it before touching any memory.
    if (dest.Length < source.Length)
    {
        throw new ArgumentException("dest.Length should be at least source.Length!", nameof(dest));
    }

    if (source.Length == 0)
    {
        return;
    }

    ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref source.DangerousGetPinnableReference());
    ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref dest.DangerousGetPinnableReference());

    // GuardAvx2 has verified that a Vector<float> holds exactly 8 elements.
    int n = source.Length / 8;

    Vector<float> magick = new Vector<float>(32768.0f);
    Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);

    // need to copy to a temporary struct, because
    // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
    // does not work. TODO: This might be a CoreClr bug, need to ask/report
    var temp = default(Octet.OfUInt32);
    ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);

    for (int i = 0; i < n; i++)
    {
        // union { float f; uint32_t i; } u;
        // u.f = 32768.0f + x * (255.0f / 256.0f);
        // return (uint8_t)u.i;
        Vector<float> x = Unsafe.Add(ref srcBase, i);

        // Clamp into [0..1] so that the scaled value stays within the byte range.
        x = Vector.Max(x, Vector<float>.Zero);
        x = Vector.Min(x, Vector<float>.One);

        x = (x * scale) + magick;

        // Storing through 'tempRef' fills 'temp' with the raw bits of the 8 floats.
        tempRef = x;

        // The lowest byte of each 32 bit pattern is the converted value.
        ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
        d.LoadFrom(ref temp);
    }
}
|
|
|
|
// TODO: Replace these with T4-d library level tuples!

/// <summary>
/// Container of the fixed size, explicitly laid out structs holding 8 values each.
/// </summary>
internal static class Octet
{
    /// <summary>
    /// 8 consecutive <see cref="uint"/> values (32 bytes in total).
    /// </summary>
    [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
    public struct OfUInt32
    {
        [FieldOffset(0 * sizeof(uint))]
        public uint V0;

        [FieldOffset(1 * sizeof(uint))]
        public uint V1;

        [FieldOffset(2 * sizeof(uint))]
        public uint V2;

        [FieldOffset(3 * sizeof(uint))]
        public uint V3;

        [FieldOffset(4 * sizeof(uint))]
        public uint V4;

        [FieldOffset(5 * sizeof(uint))]
        public uint V5;

        [FieldOffset(6 * sizeof(uint))]
        public uint V6;

        [FieldOffset(7 * sizeof(uint))]
        public uint V7;

        /// <summary>
        /// Formats the 8 values as a comma separated list enclosed in square brackets.
        /// </summary>
        /// <returns>The string representation.</returns>
        public override string ToString() =>
            $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
    }

    /// <summary>
    /// 8 consecutive <see cref="byte"/> values (8 bytes in total).
    /// </summary>
    [StructLayout(LayoutKind.Explicit, Size = 8)]
    public struct OfByte
    {
        [FieldOffset(0)]
        public byte V0;

        [FieldOffset(1)]
        public byte V1;

        [FieldOffset(2)]
        public byte V2;

        [FieldOffset(3)]
        public byte V3;

        [FieldOffset(4)]
        public byte V4;

        [FieldOffset(5)]
        public byte V5;

        [FieldOffset(6)]
        public byte V6;

        [FieldOffset(7)]
        public byte V7;

        /// <summary>
        /// Formats the 8 values as a comma separated list enclosed in square brackets.
        /// </summary>
        /// <returns>The string representation.</returns>
        public override string ToString() =>
            $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";

        /// <summary>
        /// Fills this instance by casting each of the 8 values of 'i' to <see cref="byte"/>.
        /// </summary>
        /// <param name="i">The source of the 8 <see cref="uint"/> values.</param>
        public void LoadFrom(ref OfUInt32 i)
        {
            this.V0 = (byte)i.V0;
            this.V1 = (byte)i.V1;
            this.V2 = (byte)i.V2;
            this.V3 = (byte)i.V3;

            this.V4 = (byte)i.V4;
            this.V5 = (byte)i.V5;
            this.V6 = (byte)i.V6;
            this.V7 = (byte)i.V7;
        }
    }
}
|
|
}
|
|
}
|