📷 A modern, cross-platform, 2D Graphics library for .NET
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

201 lines
8.0 KiB

// Copyright (c) Six Labors and contributors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.Memory;
namespace SixLabors.ImageSharp.PixelFormats
{
/// <content>
/// Provides optimized overrides for bulk operations.
/// </content>
public partial struct Rgba32
{
/// <summary>
/// <see cref="PixelOperations{TPixel}"/> implementation optimized for <see cref="Rgba32"/>.
/// </summary>
internal partial class PixelOperations : PixelOperations<Rgba32>
{
/// <summary>
/// SIMD optimized bulk implementation of <see cref="IPixel.PackFromVector4(Vector4)"/>
/// that works only with `count` divisible by <see cref="Vector{UInt32}.Count"/>.
/// </summary>
/// <param name="sourceColors">The <see cref="Span{T}"/> to the source colors.</param>
/// <param name="destVectors">The <see cref="Span{T}"/> to the dstination vectors.</param>
/// <param name="count">The number of pixels to convert.</param>
/// <remarks>
/// Implementation adapted from:
/// <see>
/// <cref>http://stackoverflow.com/a/5362789</cref>
/// </see>
/// TODO: We can replace this implementation in the future using new Vector API-s:
/// <see>
/// <cref>https://github.com/dotnet/corefx/issues/15957</cref>
/// </see>
/// </remarks>
internal static void ToVector4SimdAligned(ReadOnlySpan<Rgba32> sourceColors, Span<Vector4> destVectors, int count)
{
if (!Vector.IsHardwareAccelerated)
{
throw new InvalidOperationException(
"Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!");
}
DebugGuard.IsTrue(
count % Vector<uint>.Count == 0,
nameof(count),
"Argument 'count' should divisible by Vector<uint>.Count!");
var bVec = new Vector<float>(256.0f / 255.0f);
var magicFloat = new Vector<float>(32768.0f);
var magicInt = new Vector<uint>(1191182336); // reinterpreded value of 32768.0f
var mask = new Vector<uint>(255);
int unpackedRawCount = count * 4;
ref uint sourceBase = ref Unsafe.As<Rgba32, uint>(ref MemoryMarshal.GetReference(sourceColors));
ref WideRgba destBaseAsWide = ref Unsafe.As<Vector4, WideRgba>(ref MemoryMarshal.GetReference(destVectors));
ref Vector<uint> destBaseAsUInt = ref Unsafe.As<WideRgba, Vector<uint>>(ref destBaseAsWide);
ref Vector<float> destBaseAsFloat = ref Unsafe.As<WideRgba, Vector<float>>(ref destBaseAsWide);
for (int i = 0; i < count; i++)
{
uint sVal = Unsafe.Add(ref sourceBase, i);
ref WideRgba dst = ref Unsafe.Add(ref destBaseAsWide, i);
// This call is the bottleneck now:
dst.Load(sVal);
}
int numOfVectors = unpackedRawCount / Vector<uint>.Count;
for (int i = 0; i < numOfVectors; i++)
{
Vector<uint> vi = Unsafe.Add(ref destBaseAsUInt, i);
vi &= mask;
vi |= magicInt;
var vf = Vector.AsVectorSingle(vi);
vf = (vf - magicFloat) * bVec;
Unsafe.Add(ref destBaseAsFloat, i) = vf;
}
}
/// <inheritdoc />
internal override void ToVector4(ReadOnlySpan<Rgba32> sourceColors, Span<Vector4> destinationVectors, int count)
{
Guard.MustBeSizedAtLeast(sourceColors, count, nameof(sourceColors));
Guard.MustBeSizedAtLeast(destinationVectors, count, nameof(destinationVectors));
if (count < 256 || !Vector.IsHardwareAccelerated)
{
// Doesn't worth to bother with SIMD:
base.ToVector4(sourceColors, destinationVectors, count);
return;
}
int remainder = count % Vector<uint>.Count;
int alignedCount = count - remainder;
if (alignedCount > 0)
{
ToVector4SimdAligned(sourceColors, destinationVectors, alignedCount);
}
if (remainder > 0)
{
sourceColors = sourceColors.Slice(alignedCount);
destinationVectors = destinationVectors.Slice(alignedCount);
base.ToVector4(sourceColors, destinationVectors, remainder);
}
}
/// <inheritdoc />
internal override void PackFromVector4(ReadOnlySpan<Vector4> sourceVectors, Span<Rgba32> destinationColors, int count)
{
GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count);
if (!SimdUtils.IsAvx2CompatibleArchitecture)
{
base.PackFromVector4(sourceVectors, destinationColors, count);
return;
}
int remainder = count % 2;
int alignedCount = count - remainder;
if (alignedCount > 0)
{
ReadOnlySpan<float> flatSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
Span<byte> flatDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors);
SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(flatSrc, flatDest);
}
if (remainder > 0)
{
// actually: remainder == 1
int lastIdx = count - 1;
destinationColors[lastIdx].PackFromVector4(sourceVectors[lastIdx]);
}
}
/// <inheritdoc />
internal override void ToScaledVector4(ReadOnlySpan<Rgba32> sourceColors, Span<Vector4> destinationVectors, int count)
{
this.ToVector4(sourceColors, destinationVectors, count);
}
/// <inheritdoc />
internal override void PackFromScaledVector4(ReadOnlySpan<Vector4> sourceVectors, Span<Rgba32> destinationColors, int count)
{
this.PackFromVector4(sourceVectors, destinationColors, count);
}
/// <inheritdoc />
internal override void PackFromRgba32(ReadOnlySpan<Rgba32> source, Span<Rgba32> destPixels, int count)
{
GuardSpans(source, nameof(source), destPixels, nameof(destPixels), count);
source.Slice(0, count).CopyTo(destPixels);
}
/// <inheritdoc />
internal override void ToRgba32(ReadOnlySpan<Rgba32> sourcePixels, Span<Rgba32> dest, int count)
{
GuardSpans(sourcePixels, nameof(sourcePixels), dest, nameof(dest), count);
sourcePixels.Slice(0, count).CopyTo(dest);
}
/// <summary>
/// Value type to store <see cref="Rgba32"/>-s widened into multiple <see cref="uint"/>-s.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
private struct WideRgba
{
private uint r;
private uint g;
private uint b;
private uint a;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Load(uint p)
{
this.r = p;
this.g = p >> GreenShift;
this.b = p >> BlueShift;
this.a = p >> AlphaShift;
}
}
}
}
}