diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index dbe0a1fce..f89900d7e 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -304,6 +304,37 @@ internal static class Vector128_
return Vector128.Narrow(lefClamped, rightClamped);
}
+ /// <summary>
+ /// Packs signed 32-bit integers to unsigned 16-bit integers and saturates.
+ /// </summary>
+ /// <param name="left">The left hand source vector.</param>
+ /// <param name="right">The right hand source vector.</param>
+ /// <returns>The <see cref="Vector128{T}"/> of packed unsigned 16-bit integers.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 PackUnsignedSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse41.IsSupported)
+ {
+ return Sse41.PackUnsignedSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right); // left is narrowed into the lower half, right into the upper half
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right);
+ }
+
+ Vector128 min = Vector128.Create((int)ushort.MinValue); // Software fallback: clamp to the ushort range so the truncating narrow below saturates instead of wrapping
+ Vector128 max = Vector128.Create((int)ushort.MaxValue);
+ Vector128 lefClamped = Clamp(left, min, max).AsUInt32();
+ Vector128 rightClamped = Clamp(right, min, max).AsUInt32();
+ return Vector128.Narrow(lefClamped, rightClamped); // left supplies the lower elements of the result, right the upper elements
+ }
+
///
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
///
@@ -347,6 +378,78 @@ internal static class Vector128_
public static Vector128 Clamp(Vector128 value, Vector128 min, Vector128 max)
=> Vector128.Min(Vector128.Max(value, min), max);
+ /// <summary>
+ /// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
+ /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed 16-bit integers to multiply.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed 16-bit integers to multiply.
+ /// </param>
+ /// <returns>
+ /// A vector containing the low 16 bits of the products of the packed 16-bit integers
+ /// from <paramref name="left"/> and <paramref name="right"/>.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 MultiplyLow(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.MultiplyLow(left, right);
+ }
+
+ // Software fallback: widen each half of the short vectors into two int vectors
+ (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left);
+ (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right);
+
+ // Elementwise multiply: each int lane now holds the full 32-bit product
+ Vector128 prodLo = leftLower * rightLower;
+ Vector128 prodHi = leftUpper * rightUpper;
+
+ // Narrow the two int vectors back into one short vector; Narrow truncates, so each lane keeps only the low 16 bits
+ return Vector128.Narrow(prodLo, prodHi);
+ }
+
+ /// <summary>
+ /// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
+ /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed 16-bit integers to multiply.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed 16-bit integers to multiply.
+ /// </param>
+ /// <returns>
+ /// A vector containing the high 16 bits of the products of the packed 16-bit integers
+ /// from <paramref name="left"/> and <paramref name="right"/>.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 MultiplyHigh(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.MultiplyHigh(left, right);
+ }
+
+ // Software fallback: widen each half of the short vectors into two int vectors
+ (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left);
+ (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right);
+
+ // Elementwise multiply: each int lane now holds the full 32-bit product
+ Vector128 prodLo = leftLower * rightLower;
+ Vector128 prodHi = leftUpper * rightUpper;
+
+ // Shift right by 16 bits to move the high word of each product into the low word of its lane
+ prodLo >>= 16;
+ prodHi >>= 16;
+
+ // Narrow the two int vectors back into one short vector; the truncation keeps the word that was shifted down
+ return Vector128.Narrow(prodLo, prodHi);
+ }
+
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
index 817d6e607..dfefd2d34 100644
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -162,6 +162,27 @@ internal static class Vector256_
return (vm0 * vm1) - vs;
}
+ /// <summary>
+ /// Packs signed 32-bit integers to unsigned 16-bit integers and saturates.
+ /// </summary>
+ /// <remarks>
+ /// Packing is performed independently on each 128-bit lane, exactly like the AVX2 instruction: the result
+ /// holds the packed lower lane of <paramref name="left"/>, then the lower lane of <paramref name="right"/>,
+ /// then the upper lane of <paramref name="left"/> and finally the upper lane of <paramref name="right"/>.
+ /// Callers that combine the result with a per-lane byte shuffle depend on this ordering, so the
+ /// software path reproduces it instead of narrowing the two vectors end to end.
+ /// </remarks>
+ /// <param name="left">The left hand source vector.</param>
+ /// <param name="right">The right hand source vector.</param>
+ /// <returns>The <see cref="Vector256{T}"/> of packed unsigned 16-bit integers.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector256 PackUnsignedSaturate(Vector256 left, Vector256 right)
+ {
+ if (Avx2.IsSupported)
+ {
+ return Avx2.PackUnsignedSaturate(left, right);
+ }
+
+ // Software fallback: pack each 128-bit half separately so the element order is identical to
+ // the AVX2 path. A single Vector256.Narrow(left, right) would instead yield
+ // [left(0..7), right(0..7)], which disagrees with the accelerated result.
+ return Vector256.Create(
+ Vector128_.PackUnsignedSaturate(left.GetLower(), right.GetLower()),
+ Vector128_.PackUnsignedSaturate(left.GetUpper(), right.GetUpper()));
+ }
+
///
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
///
@@ -210,6 +231,78 @@ internal static class Vector256_
return Vector256.WidenLower(value.ToVector256());
}
+ /// <summary>
+ /// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
+ /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed 16-bit integers to multiply.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed 16-bit integers to multiply.
+ /// </param>
+ /// <returns>
+ /// A vector containing the low 16 bits of the products of the packed 16-bit integers
+ /// from <paramref name="left"/> and <paramref name="right"/>.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector256 MultiplyLow(Vector256 left, Vector256 right)
+ {
+ if (Avx2.IsSupported)
+ {
+ return Avx2.MultiplyLow(left, right);
+ }
+
+ // Software fallback: widen each half of the short vectors into two int vectors
+ (Vector256 leftLower, Vector256 leftUpper) = Vector256.Widen(left);
+ (Vector256 rightLower, Vector256 rightUpper) = Vector256.Widen(right);
+
+ // Elementwise multiply: each int lane now holds the full 32-bit product
+ Vector256 prodLo = leftLower * rightLower;
+ Vector256 prodHi = leftUpper * rightUpper;
+
+ // Narrow the two int vectors back into one short vector; Narrow truncates, so each lane keeps only the low 16 bits
+ return Vector256.Narrow(prodLo, prodHi);
+ }
+
+ /// <summary>
+ /// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
+ /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed 16-bit integers to multiply.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed 16-bit integers to multiply.
+ /// </param>
+ /// <returns>
+ /// A vector containing the high 16 bits of the products of the packed 16-bit integers
+ /// from <paramref name="left"/> and <paramref name="right"/>.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector256 MultiplyHigh(Vector256 left, Vector256 right)
+ {
+ if (Avx2.IsSupported)
+ {
+ return Avx2.MultiplyHigh(left, right);
+ }
+
+ // Software fallback: widen each half of the short vectors into two int vectors
+ (Vector256 leftLower, Vector256 leftUpper) = Vector256.Widen(left);
+ (Vector256 rightLower, Vector256 rightUpper) = Vector256.Widen(right);
+
+ // Elementwise multiply: each int lane now holds the full 32-bit product
+ Vector256 prodLo = leftLower * rightLower;
+ Vector256 prodHi = leftUpper * rightUpper;
+
+ // Shift right by 16 bits to move the high word of each product into the low word of its lane
+ prodLo >>= 16;
+ prodHi >>= 16;
+
+ // Narrow the two int vectors back into one short vector; the truncation keeps the word that was shifted down
+ return Vector256.Narrow(prodLo, prodHi);
+ }
+
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
index 9a6dfb66e..5c6fb5604 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
@@ -4,7 +4,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless;
@@ -12,17 +12,17 @@ internal static class ColorSpaceTransformUtils
{
public static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo)
{
- if (Avx2.IsSupported && tileWidth >= 16)
+ if (Vector256_.SupportsShuffleNativeByte && tileWidth >= 16)
{
const int span = 16;
Span values = stackalloc ushort[span];
- var collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
- var collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
- var collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
- var collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
- var collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
- var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
- var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
+ Vector256 collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
+ Vector256 collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
+ Vector256 collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+ Vector256 collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+ Vector256 collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+ Vector256 multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
+ Vector256 multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
for (int y = 0; y < tileHeight; y++)
{
Span srcSpan = bgra[(y * stride)..];
@@ -33,18 +33,18 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
- Vector256 r0 = Avx2.Shuffle(input0, collectColorBlueTransformsShuffleLowMask256);
- Vector256 r1 = Avx2.Shuffle(input1, collectColorBlueTransformsShuffleHighMask256);
- Vector256 r = Avx2.Or(r0, r1);
- Vector256 gb0 = Avx2.And(input0, collectColorBlueTransformsGreenBlueMask256);
- Vector256 gb1 = Avx2.And(input1, collectColorBlueTransformsGreenBlueMask256);
- Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
- Vector256 g = Avx2.And(gb.AsByte(), collectColorBlueTransformsGreenMask256);
- Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr);
- Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg);
- Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte());
- Vector256 d = Avx2.Subtract(c, a.AsByte());
- Vector256 e = Avx2.And(d, collectColorBlueTransformsBlueMask256);
+ Vector256 r0 = Vector256_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask256);
+ Vector256 r1 = Vector256_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask256);
+ Vector256 r = r0 | r1;
+ Vector256 gb0 = input0 & collectColorBlueTransformsGreenBlueMask256;
+ Vector256 gb1 = input1 & collectColorBlueTransformsGreenBlueMask256;
+ Vector256 gb = Vector256_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
+ Vector256 g = gb.AsByte() & collectColorBlueTransformsGreenMask256;
+ Vector256 a = Vector256_.MultiplyHigh(r.AsInt16(), multsr);
+ Vector256 b = Vector256_.MultiplyHigh(g.AsInt16(), multsg);
+ Vector256 c = gb.AsByte() - b.AsByte();
+ Vector256 d = c - a.AsByte();
+ Vector256 e = d & collectColorBlueTransformsBlueMask256;
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As>(ref outputRef) = e.AsUInt16();
@@ -59,20 +59,20 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
- CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
+ CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
}
}
- else if (Sse41.IsSupported)
+ else if (Vector128.IsHardwareAccelerated)
{
const int span = 8;
Span values = stackalloc ushort[span];
- var collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
- var collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
- var collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
- var collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
- var collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
- var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
- var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
+ Vector128 collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
+ Vector128 collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
+ Vector128 collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+ Vector128 collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+ Vector128 collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+ Vector128 multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
+ Vector128 multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
for (int y = 0; y < tileHeight; y++)
{
Span srcSpan = bgra[(y * stride)..];
@@ -83,18 +83,18 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
- Vector128 r0 = Ssse3.Shuffle(input0, collectColorBlueTransformsShuffleLowMask);
- Vector128 r1 = Ssse3.Shuffle(input1, collectColorBlueTransformsShuffleHighMask);
- Vector128 r = Sse2.Or(r0, r1);
- Vector128 gb0 = Sse2.And(input0, collectColorBlueTransformsGreenBlueMask);
- Vector128 gb1 = Sse2.And(input1, collectColorBlueTransformsGreenBlueMask);
- Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
- Vector128 g = Sse2.And(gb.AsByte(), collectColorBlueTransformsGreenMask);
- Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr);
- Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg);
- Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte());
- Vector128 d = Sse2.Subtract(c, a.AsByte());
- Vector128 e = Sse2.And(d, collectColorBlueTransformsBlueMask);
+ Vector128 r0 = Vector128_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask);
+ Vector128 r1 = Vector128_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask);
+ Vector128 r = r0 | r1;
+ Vector128 gb0 = input0 & collectColorBlueTransformsGreenBlueMask;
+ Vector128 gb1 = input1 & collectColorBlueTransformsGreenBlueMask;
+ Vector128 gb = Vector128_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
+ Vector128 g = gb.AsByte() & collectColorBlueTransformsGreenMask;
+ Vector128 a = Vector128_.MultiplyHigh(r.AsInt16(), multsr);
+ Vector128 b = Vector128_.MultiplyHigh(g.AsInt16(), multsg);
+ Vector128 c = gb.AsByte() - b.AsByte();
+ Vector128 d = c - a.AsByte();
+ Vector128 e = d & collectColorBlueTransformsBlueMask;
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As>(ref outputRef) = e.AsUInt16();
@@ -109,16 +109,16 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
- CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
+ CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
}
}
else
{
- CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
+ CollectColorBlueTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
}
}
- private static void CollectColorBlueTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo)
+ private static void CollectColorBlueTransformsScalar(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo)
{
int pos = 0;
while (tileHeight-- > 0)
@@ -135,11 +135,11 @@ internal static class ColorSpaceTransformUtils
public static void CollectColorRedTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo)
{
- if (Avx2.IsSupported && tileWidth >= 16)
+ if (Vector256.IsHardwareAccelerated && tileWidth >= 16)
{
Vector256 collectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte();
Vector256 collectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte();
- var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
+ Vector256 multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
const int span = 16;
Span values = stackalloc ushort[span];
for (int y = 0; y < tileHeight; y++)
@@ -152,15 +152,15 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
- Vector256 g0 = Avx2.And(input0, collectColorRedTransformsGreenMask256); // 0 0 | g 0
- Vector256 g1 = Avx2.And(input1, collectColorRedTransformsGreenMask256);
- Vector256 g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
- Vector256 a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
- Vector256 a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16);
- Vector256 a = Avx2.PackUnsignedSaturate(a0, a1); // x r
- Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr
- Vector256 c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r'
- Vector256 d = Avx2.And(c, collectColorRedTransformsAndMask256); // 0 r'
+ Vector256 g0 = input0 & collectColorRedTransformsGreenMask256; // 0 0 | g 0
+ Vector256 g1 = input1 & collectColorRedTransformsGreenMask256;
+ Vector256 g = Vector256_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
+ Vector256 a0 = Vector256.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
+ Vector256 a1 = Vector256.ShiftRightLogical(input1.AsInt32(), 16);
+ Vector256 a = Vector256_.PackUnsignedSaturate(a0, a1); // x r
+ Vector256 b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); // x dr
+ Vector256 c = a.AsByte() - b.AsByte(); // x r'
+ Vector256 d = c & collectColorRedTransformsAndMask256; // 0 r'
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As>(ref outputRef) = d.AsUInt16();
@@ -175,14 +175,14 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
- CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
+ CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
}
}
- else if (Sse41.IsSupported)
+ else if (Vector128.IsHardwareAccelerated)
{
Vector128 collectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
Vector128 collectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
- var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
+ Vector128 multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
const int span = 8;
Span values = stackalloc ushort[span];
for (int y = 0; y < tileHeight; y++)
@@ -195,15 +195,15 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
- Vector128 g0 = Sse2.And(input0, collectColorRedTransformsGreenMask); // 0 0 | g 0
- Vector128 g1 = Sse2.And(input1, collectColorRedTransformsGreenMask);
- Vector128 g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
- Vector128 a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
- Vector128 a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16);
- Vector128 a = Sse41.PackUnsignedSaturate(a0, a1); // x r
- Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr
- Vector128 c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r'
- Vector128 d = Sse2.And(c, collectColorRedTransformsAndMask); // 0 r'
+ Vector128 g0 = input0 & collectColorRedTransformsGreenMask; // 0 0 | g 0
+ Vector128 g1 = input1 & collectColorRedTransformsGreenMask;
+ Vector128 g = Vector128_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
+ Vector128 a0 = Vector128.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
+ Vector128 a1 = Vector128.ShiftRightLogical(input1.AsInt32(), 16);
+ Vector128 a = Vector128_.PackUnsignedSaturate(a0, a1); // x r
+ Vector128 b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); // x dr
+ Vector128 c = a.AsByte() - b.AsByte(); // x r'
+ Vector128 d = c & collectColorRedTransformsAndMask; // 0 r'
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As>(ref outputRef) = d.AsUInt16();
@@ -218,16 +218,16 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
- CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
+ CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
}
}
else
{
- CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
+ CollectColorRedTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
}
}
- private static void CollectColorRedTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo)
+ private static void CollectColorRedTransformsScalar(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo)
{
int pos = 0;
while (tileHeight-- > 0)
diff --git a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
index c5e8c975f..6073888fe 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
@@ -71,17 +71,17 @@ public class ColorSpaceTransformUtilsTests
public void CollectColorBlueTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.AllowAll);
[Fact]
- public void CollectColorBlueTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
+ public void CollectColorBlueTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
[Fact]
- public void CollectColorBlueTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
+ public void CollectColorBlueTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
[Fact]
public void CollectColorRedTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.AllowAll);
[Fact]
- public void CollectColorRedTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
+ public void CollectColorRedTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
[Fact]
- public void CollectColorRedTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
+ public void CollectColorRedTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
}