Browse Source

Port ColorSpaceTransformUtils

pull/2933/head
James Jackson-South 8 months ago
parent
commit
82bc79744b
  1. 103
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  2. 93
      src/ImageSharp/Common/Helpers/Vector256Utilities.cs
  3. 142
      src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
  4. 8
      tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs

103
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

@ -304,6 +304,37 @@ internal static class Vector128_
return Vector128.Narrow(lefClamped, rightClamped);
}
/// <summary>
/// Narrows two vectors of signed 32-bit integers into one vector of unsigned 16-bit integers,
/// saturating every element to the <see cref="ushort"/> range.
/// </summary>
/// <param name="left">The vector whose elements fill the lower half of the result.</param>
/// <param name="right">The vector whose elements fill the upper half of the result.</param>
/// <returns>The saturated <see cref="Vector128{UInt16}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<ushort> PackUnsignedSaturate(Vector128<int> left, Vector128<int> right)
{
    // x86: single PACKUSDW instruction.
    if (Sse41.IsSupported)
    {
        return Sse41.PackUnsignedSaturate(left, right);
    }

    // Arm: narrow 'left' into a 64-bit half, then narrow 'right' on top of it.
    if (AdvSimd.IsSupported)
    {
        Vector64<ushort> lowerHalf = AdvSimd.ExtractNarrowingSaturateUnsignedLower(left);
        return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(lowerHalf, right);
    }

    // WASM: dedicated saturating narrow.
    if (PackedSimd.IsSupported)
    {
        return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right);
    }

    // Portable path: clamp to [0, 65535] first so the truncating narrow cannot wrap.
    Vector128<int> floor = Vector128.Create((int)ushort.MinValue);
    Vector128<int> ceiling = Vector128.Create((int)ushort.MaxValue);

    Vector128<uint> saturatedLeft = Vector128.Min(Vector128.Max(left, floor), ceiling).AsUInt32();
    Vector128<uint> saturatedRight = Vector128.Min(Vector128.Max(right, floor), ceiling).AsUInt32();

    return Vector128.Narrow(saturatedLeft, saturatedRight);
}
/// <summary>
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
/// </summary>
@ -347,6 +378,78 @@ internal static class Vector128_
public static Vector128<T> Clamp<T>(Vector128<T> value, Vector128<T> min, Vector128<T> max)
=> Vector128.Min(Vector128.Max(value, min), max);
/// <summary>
/// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
/// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 16-bit integers to multiply.
/// </param>
/// <param name="right">
/// The second vector containing packed 16-bit integers to multiply.
/// </param>
/// <returns>
/// A vector containing the low 16 bits of the products of the packed 16-bit integers
/// from <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<short> MultiplyLow(Vector128<short> left, Vector128<short> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.MultiplyLow(left, right);
    }

    // The low 16 bits of the 32-bit product are exactly the wrapping 16-bit product,
    // so the element-wise operator gives the same answer without having to widen both
    // inputs to 32-bit lanes and narrow the results back again.
    return left * right;
}
/// <summary>
/// Multiplies corresponding signed 16-bit elements of <paramref name="left"/> and
/// <paramref name="right"/> at 32-bit precision and keeps the upper 16 bits of each product.
/// </summary>
/// <param name="left">
/// The first vector of packed 16-bit integers.
/// </param>
/// <param name="right">
/// The second vector of packed 16-bit integers.
/// </param>
/// <returns>
/// A vector holding, per element, the high 16 bits of the 32-bit product of the matching
/// elements of <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<short> MultiplyHigh(Vector128<short> left, Vector128<short> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.MultiplyHigh(left, right);
    }

    // Sign-extend each half to 32-bit lanes and form the full products.
    Vector128<int> lowerProducts = Vector128.WidenLower(left) * Vector128.WidenLower(right);
    Vector128<int> upperProducts = Vector128.WidenUpper(left) * Vector128.WidenUpper(right);

    // An arithmetic shift moves the high word (sign included) into the low 16 bits,
    // so the truncating narrow below keeps exactly that word.
    Vector128<int> lowerHighWords = Vector128.ShiftRightArithmetic(lowerProducts, 16);
    Vector128<int> upperHighWords = Vector128.ShiftRightArithmetic(upperProducts, 16);

    return Vector128.Narrow(lowerHighWords, upperHighWords);
}
/// <summary>
/// Throws an <see cref="UnreachableException"/>; never returns to the caller.
/// </summary>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

93
src/ImageSharp/Common/Helpers/Vector256Utilities.cs

@ -162,6 +162,27 @@ internal static class Vector256_
return (vm0 * vm1) - vs;
}
/// <summary>
/// Packs signed 32-bit integers to unsigned 16-bit integers and saturates.
/// </summary>
/// <remarks>
/// Packing is performed independently on each 128-bit lane, matching
/// <see cref="Avx2.PackUnsignedSaturate(Vector256{int}, Vector256{int})"/>: the result is laid out as
/// [left 0-3, right 0-3, left 4-7, right 4-7].
/// </remarks>
/// <param name="left">The left hand source vector.</param>
/// <param name="right">The right hand source vector.</param>
/// <returns>The <see cref="Vector256{UInt16}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<ushort> PackUnsignedSaturate(Vector256<int> left, Vector256<int> right)
{
    if (Avx2.IsSupported)
    {
        return Avx2.PackUnsignedSaturate(left, right);
    }

    // Clamp to [0, 65535] so the truncating narrow below cannot wrap.
    Vector256<int> min = Vector256.Create((int)ushort.MinValue);
    Vector256<int> max = Vector256.Create((int)ushort.MaxValue);
    Vector256<uint> leftClamped = Vector256.Min(Vector256.Max(left, min), max).AsUInt32();
    Vector256<uint> rightClamped = Vector256.Min(Vector256.Max(right, min), max).AsUInt32();

    // Narrow lane by lane. A single Vector256.Narrow(left, right) would emit
    // [left 0-7, right 0-7], which is a different element order from the AVX2 path above
    // and would make results depend on the hardware the code happens to run on.
    Vector128<ushort> lower = Vector128.Narrow(leftClamped.GetLower(), rightClamped.GetLower());
    Vector128<ushort> upper = Vector128.Narrow(leftClamped.GetUpper(), rightClamped.GetUpper());
    return Vector256.Create(lower, upper);
}
/// <summary>
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
/// </summary>
@ -210,6 +231,78 @@ internal static class Vector256_
return Vector256.WidenLower(value.ToVector256());
}
/// <summary>
/// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
/// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 16-bit integers to multiply.
/// </param>
/// <param name="right">
/// The second vector containing packed 16-bit integers to multiply.
/// </param>
/// <returns>
/// A vector containing the low 16 bits of the products of the packed 16-bit integers
/// from <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<short> MultiplyLow(Vector256<short> left, Vector256<short> right)
{
    if (Avx2.IsSupported)
    {
        return Avx2.MultiplyLow(left, right);
    }

    // The low 16 bits of the 32-bit product are exactly the wrapping 16-bit product,
    // so the element-wise operator gives the same answer without having to widen both
    // inputs to 32-bit lanes and narrow the results back again.
    return left * right;
}
/// <summary>
/// Multiplies corresponding signed 16-bit elements of <paramref name="left"/> and
/// <paramref name="right"/> at 32-bit precision and keeps the upper 16 bits of each product.
/// </summary>
/// <param name="left">
/// The first vector of packed 16-bit integers.
/// </param>
/// <param name="right">
/// The second vector of packed 16-bit integers.
/// </param>
/// <returns>
/// A vector holding, per element, the high 16 bits of the 32-bit product of the matching
/// elements of <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<short> MultiplyHigh(Vector256<short> left, Vector256<short> right)
{
    if (Avx2.IsSupported)
    {
        return Avx2.MultiplyHigh(left, right);
    }

    // Sign-extend each half to 32-bit lanes and form the full products.
    Vector256<int> lowerProducts = Vector256.WidenLower(left) * Vector256.WidenLower(right);
    Vector256<int> upperProducts = Vector256.WidenUpper(left) * Vector256.WidenUpper(right);

    // An arithmetic shift moves the high word (sign included) into the low 16 bits,
    // so the truncating narrow below keeps exactly that word.
    Vector256<int> lowerHighWords = Vector256.ShiftRightArithmetic(lowerProducts, 16);
    Vector256<int> upperHighWords = Vector256.ShiftRightArithmetic(upperProducts, 16);

    return Vector256.Narrow(lowerHighWords, upperHighWords);
}
/// <summary>
/// Throws an <see cref="UnreachableException"/>; never returns to the caller.
/// </summary>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

142
src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs

@ -4,7 +4,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless;
@ -12,17 +12,17 @@ internal static class ColorSpaceTransformUtils
{
public static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
{
if (Avx2.IsSupported && tileWidth >= 16)
if (Vector256_.SupportsShuffleNativeByte && tileWidth >= 16)
{
const int span = 16;
Span<ushort> values = stackalloc ushort[span];
var collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
var collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
var collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
var collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
var collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
Vector256<byte> collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
Vector256<byte> collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
Vector256<byte> collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
Vector256<byte> collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector256<byte> collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
Vector256<short> multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
for (int y = 0; y < tileHeight; y++)
{
Span<uint> srcSpan = bgra[(y * stride)..];
@ -33,18 +33,18 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector256<byte> r0 = Avx2.Shuffle(input0, collectColorBlueTransformsShuffleLowMask256);
Vector256<byte> r1 = Avx2.Shuffle(input1, collectColorBlueTransformsShuffleHighMask256);
Vector256<byte> r = Avx2.Or(r0, r1);
Vector256<byte> gb0 = Avx2.And(input0, collectColorBlueTransformsGreenBlueMask256);
Vector256<byte> gb1 = Avx2.And(input1, collectColorBlueTransformsGreenBlueMask256);
Vector256<ushort> gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector256<byte> g = Avx2.And(gb.AsByte(), collectColorBlueTransformsGreenMask256);
Vector256<short> a = Avx2.MultiplyHigh(r.AsInt16(), multsr);
Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg);
Vector256<byte> c = Avx2.Subtract(gb.AsByte(), b.AsByte());
Vector256<byte> d = Avx2.Subtract(c, a.AsByte());
Vector256<byte> e = Avx2.And(d, collectColorBlueTransformsBlueMask256);
Vector256<byte> r0 = Vector256_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask256);
Vector256<byte> r1 = Vector256_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask256);
Vector256<byte> r = r0 | r1;
Vector256<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask256;
Vector256<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask256;
Vector256<ushort> gb = Vector256_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector256<byte> g = gb.AsByte() & collectColorBlueTransformsGreenMask256;
Vector256<short> a = Vector256_.MultiplyHigh(r.AsInt16(), multsr);
Vector256<short> b = Vector256_.MultiplyHigh(g.AsInt16(), multsg);
Vector256<byte> c = gb.AsByte() - b.AsByte();
Vector256<byte> d = c - a.AsByte();
Vector256<byte> e = d & collectColorBlueTransformsBlueMask256;
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = e.AsUInt16();
@ -59,20 +59,20 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
}
}
else if (Sse41.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
const int span = 8;
Span<ushort> values = stackalloc ushort[span];
var collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
var collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
var collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
var collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
var collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
Vector128<byte> collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
Vector128<byte> collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
Vector128<byte> collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
Vector128<byte> collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector128<byte> collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
Vector128<short> multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
for (int y = 0; y < tileHeight; y++)
{
Span<uint> srcSpan = bgra[(y * stride)..];
@ -83,18 +83,18 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector128<byte> r0 = Ssse3.Shuffle(input0, collectColorBlueTransformsShuffleLowMask);
Vector128<byte> r1 = Ssse3.Shuffle(input1, collectColorBlueTransformsShuffleHighMask);
Vector128<byte> r = Sse2.Or(r0, r1);
Vector128<byte> gb0 = Sse2.And(input0, collectColorBlueTransformsGreenBlueMask);
Vector128<byte> gb1 = Sse2.And(input1, collectColorBlueTransformsGreenBlueMask);
Vector128<ushort> gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector128<byte> g = Sse2.And(gb.AsByte(), collectColorBlueTransformsGreenMask);
Vector128<short> a = Sse2.MultiplyHigh(r.AsInt16(), multsr);
Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg);
Vector128<byte> c = Sse2.Subtract(gb.AsByte(), b.AsByte());
Vector128<byte> d = Sse2.Subtract(c, a.AsByte());
Vector128<byte> e = Sse2.And(d, collectColorBlueTransformsBlueMask);
Vector128<byte> r0 = Vector128_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask);
Vector128<byte> r1 = Vector128_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask);
Vector128<byte> r = r0 | r1;
Vector128<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask;
Vector128<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask;
Vector128<ushort> gb = Vector128_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector128<byte> g = gb.AsByte() & collectColorBlueTransformsGreenMask;
Vector128<short> a = Vector128_.MultiplyHigh(r.AsInt16(), multsr);
Vector128<short> b = Vector128_.MultiplyHigh(g.AsInt16(), multsg);
Vector128<byte> c = gb.AsByte() - b.AsByte();
Vector128<byte> d = c - a.AsByte();
Vector128<byte> e = d & collectColorBlueTransformsBlueMask;
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = e.AsUInt16();
@ -109,16 +109,16 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
}
}
else
{
CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
CollectColorBlueTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
}
}
private static void CollectColorBlueTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
private static void CollectColorBlueTransformsScalar(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
{
int pos = 0;
while (tileHeight-- > 0)
@ -135,11 +135,11 @@ internal static class ColorSpaceTransformUtils
public static void CollectColorRedTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
{
if (Avx2.IsSupported && tileWidth >= 16)
if (Vector256.IsHardwareAccelerated && tileWidth >= 16)
{
Vector256<byte> collectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte();
Vector256<byte> collectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte();
var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
const int span = 16;
Span<ushort> values = stackalloc ushort[span];
for (int y = 0; y < tileHeight; y++)
@ -152,15 +152,15 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector256<byte> g0 = Avx2.And(input0, collectColorRedTransformsGreenMask256); // 0 0 | g 0
Vector256<byte> g1 = Avx2.And(input1, collectColorRedTransformsGreenMask256);
Vector256<ushort> g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector256<int> a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector256<int> a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16);
Vector256<ushort> a = Avx2.PackUnsignedSaturate(a0, a1); // x r
Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector256<byte> c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r'
Vector256<byte> d = Avx2.And(c, collectColorRedTransformsAndMask256); // 0 r'
Vector256<byte> g0 = input0 & collectColorRedTransformsGreenMask256; // 0 0 | g 0
Vector256<byte> g1 = input1 & collectColorRedTransformsGreenMask256;
Vector256<ushort> g = Vector256_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector256<int> a0 = Vector256.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector256<int> a1 = Vector256.ShiftRightLogical(input1.AsInt32(), 16);
Vector256<ushort> a = Vector256_.PackUnsignedSaturate(a0, a1); // x r
Vector256<short> b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector256<byte> c = a.AsByte() - b.AsByte(); // x r'
Vector256<byte> d = c & collectColorRedTransformsAndMask256; // 0 r'
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = d.AsUInt16();
@ -175,14 +175,14 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
}
}
else if (Sse41.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> collectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
Vector128<byte> collectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
const int span = 8;
Span<ushort> values = stackalloc ushort[span];
for (int y = 0; y < tileHeight; y++)
@ -195,15 +195,15 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector128<byte> g0 = Sse2.And(input0, collectColorRedTransformsGreenMask); // 0 0 | g 0
Vector128<byte> g1 = Sse2.And(input1, collectColorRedTransformsGreenMask);
Vector128<ushort> g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector128<int> a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector128<int> a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16);
Vector128<ushort> a = Sse41.PackUnsignedSaturate(a0, a1); // x r
Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector128<byte> c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r'
Vector128<byte> d = Sse2.And(c, collectColorRedTransformsAndMask); // 0 r'
Vector128<byte> g0 = input0 & collectColorRedTransformsGreenMask; // 0 0 | g 0
Vector128<byte> g1 = input1 & collectColorRedTransformsGreenMask;
Vector128<ushort> g = Vector128_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector128<int> a0 = Vector128.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector128<int> a1 = Vector128.ShiftRightLogical(input1.AsInt32(), 16);
Vector128<ushort> a = Vector128_.PackUnsignedSaturate(a0, a1); // x r
Vector128<short> b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector128<byte> c = a.AsByte() - b.AsByte(); // x r'
Vector128<byte> d = c & collectColorRedTransformsAndMask; // 0 r'
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = d.AsUInt16();
@ -218,16 +218,16 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
}
}
else
{
CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
CollectColorRedTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
}
}
private static void CollectColorRedTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
private static void CollectColorRedTransformsScalar(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
{
int pos = 0;
while (tileHeight-- > 0)

8
tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs

@ -71,17 +71,17 @@ public class ColorSpaceTransformUtilsTests
public void CollectColorBlueTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.AllowAll);
[Fact]
public void CollectColorBlueTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
public void CollectColorBlueTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
[Fact]
public void CollectColorBlueTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
public void CollectColorBlueTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
[Fact]
public void CollectColorRedTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.AllowAll);
[Fact]
public void CollectColorRedTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
public void CollectColorRedTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
[Fact]
public void CollectColorRedTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
public void CollectColorRedTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
}

Loading…
Cancel
Save