diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index dbe0a1fce..f89900d7e 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -304,6 +304,37 @@ internal static class Vector128_ return Vector128.Narrow(lefClamped, rightClamped); } + /// + /// Packs signed 32-bit integers to unsigned 16-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The packed vector of unsigned 16-bit integers. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 PackUnsignedSaturate(Vector128 left, Vector128 right) + { + if (Sse41.IsSupported) + { + return Sse41.PackUnsignedSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right); + } + + Vector128 min = Vector128.Create((int)ushort.MinValue); + Vector128 max = Vector128.Create((int)ushort.MaxValue); + Vector128 lefClamped = Clamp(left, min, max).AsUInt32(); + Vector128 rightClamped = Clamp(right, min, max).AsUInt32(); + return Vector128.Narrow(lefClamped, rightClamped); + } + /// /// Packs signed 32-bit integers to signed 16-bit integers and saturates. /// @@ -347,6 +378,78 @@ internal static class Vector128_ public static Vector128 Clamp(Vector128 value, Vector128 min, Vector128 max) => Vector128.Min(Vector128.Max(value, min), max); + /// + /// Multiply the packed 16-bit integers in left and right, producing + /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. 
+ /// + /// + /// A vector containing the low 16 bits of the products of the packed 16-bit integers + /// from left and right. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyLow(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); + (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLower * rightLower; + Vector128 prodHi = leftUpper * rightUpper; + + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } + + /// + /// Multiply the packed 16-bit integers in left and right, producing + /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the high 16 bits of the products of the packed 16-bit integers + /// from left and right. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyHigh(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); + (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLower * rightLower; + Vector128 prodHi = leftUpper * rightUpper; + + // Arithmetic shift right by 16 bits to extract the high word + prodLo >>= 16; + prodHi >>= 16; + + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 817d6e607..dfefd2d34 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -162,6 +162,27 @@ internal static class Vector256_ return (vm0 * vm1) - vs; } + /// + /// Packs signed 32-bit integers to unsigned 16-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The packed vector of unsigned 16-bit integers. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 PackUnsignedSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.PackUnsignedSaturate(left, right); + } + + Vector256 min = Vector256.Create((int)ushort.MinValue); + Vector256 max = Vector256.Create((int)ushort.MaxValue); + Vector256 lefClamped = Clamp(left, min, max).AsUInt32(); + Vector256 rightClamped = Clamp(right, min, max).AsUInt32(); + return Vector256.Narrow(lefClamped, rightClamped); + } + /// /// Packs signed 32-bit integers to signed 16-bit integers and saturates. /// @@ -210,6 +231,78 @@ internal static class Vector256_ return Vector256.WidenLower(value.ToVector256()); } + /// + /// Multiply the packed 16-bit integers in left and right, producing + /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the low 16 bits of the products of the packed 16-bit integers + /// from left and right. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplyLow(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.MultiplyLow(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector256 leftLower, Vector256 leftUpper) = Vector256.Widen(left); + (Vector256 rightLower, Vector256 rightUpper) = Vector256.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector256 prodLo = leftLower * rightLower; + Vector256 prodHi = leftUpper * rightUpper; + + // Narrow the two int vectors back into one short vector + return Vector256.Narrow(prodLo, prodHi); + } + + /// + /// Multiply the packed 16-bit integers in left and right, producing + /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the high 16 bits of the products of the packed 16-bit integers + /// from left and right. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplyHigh(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.MultiplyHigh(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector256 leftLower, Vector256 leftUpper) = Vector256.Widen(left); + (Vector256 rightLower, Vector256 rightUpper) = Vector256.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector256 prodLo = leftLower * rightLower; + Vector256 prodHi = leftUpper * rightUpper; + + // Arithmetic shift right by 16 bits to extract the high word + prodLo >>= 16; + prodHi >>= 16; + + // Narrow the two int vectors back into one short vector + return Vector256.Narrow(prodLo, prodHi); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs index 9a6dfb66e..5c6fb5604 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs @@ -4,7 +4,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Webp.Lossless; @@ -12,17 +12,17 @@ internal static class ColorSpaceTransformUtils { public static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) { - if (Avx2.IsSupported && tileWidth >= 16) + if (Vector256_.SupportsShuffleNativeByte && tileWidth >= 16) { const int span = 16; Span values = stackalloc ushort[span]; - var collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 
255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255); - var collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30); - var collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - var collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - var collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); - var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); - var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); + Vector256 collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255); + Vector256 collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30); + Vector256 collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + Vector256 collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + Vector256 collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 
255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + Vector256 multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); + Vector256 multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); for (int y = 0; y < tileHeight; y++) { Span srcSpan = bgra[(y * stride)..]; @@ -33,18 +33,18 @@ internal static class ColorSpaceTransformUtils nuint input1Idx = x + (span / 2); Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector256 r0 = Avx2.Shuffle(input0, collectColorBlueTransformsShuffleLowMask256); - Vector256 r1 = Avx2.Shuffle(input1, collectColorBlueTransformsShuffleHighMask256); - Vector256 r = Avx2.Or(r0, r1); - Vector256 gb0 = Avx2.And(input0, collectColorBlueTransformsGreenBlueMask256); - Vector256 gb1 = Avx2.And(input1, collectColorBlueTransformsGreenBlueMask256); - Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); - Vector256 g = Avx2.And(gb.AsByte(), collectColorBlueTransformsGreenMask256); - Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr); - Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); - Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte()); - Vector256 d = Avx2.Subtract(c, a.AsByte()); - Vector256 e = Avx2.And(d, collectColorBlueTransformsBlueMask256); + Vector256 r0 = Vector256_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask256); + Vector256 r1 = Vector256_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask256); + Vector256 r = r0 | r1; + Vector256 gb0 = input0 & collectColorBlueTransformsGreenBlueMask256; + Vector256 gb1 = input1 & collectColorBlueTransformsGreenBlueMask256; + Vector256 gb = Vector256_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector256 g = gb.AsByte() & collectColorBlueTransformsGreenMask256; + Vector256 a = Vector256_.MultiplyHigh(r.AsInt16(), multsr); + Vector256 b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); + Vector256 
c = gb.AsByte() - b.AsByte(); + Vector256 d = c - a.AsByte(); + Vector256 e = d & collectColorBlueTransformsBlueMask256; ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = e.AsUInt16(); @@ -59,20 +59,20 @@ internal static class ColorSpaceTransformUtils int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); } } - else if (Sse41.IsSupported) + else if (Vector128.IsHardwareAccelerated) { const int span = 8; Span values = stackalloc ushort[span]; - var collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); - var collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); - var collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - var collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - var collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); - var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue)); - var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue)); + Vector128 collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); + Vector128 collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); + Vector128 collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + Vector128 
collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + Vector128 collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + Vector128 multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue)); + Vector128 multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue)); for (int y = 0; y < tileHeight; y++) { Span srcSpan = bgra[(y * stride)..]; @@ -83,18 +83,18 @@ internal static class ColorSpaceTransformUtils nuint input1Idx = x + (span / 2); Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector128 r0 = Ssse3.Shuffle(input0, collectColorBlueTransformsShuffleLowMask); - Vector128 r1 = Ssse3.Shuffle(input1, collectColorBlueTransformsShuffleHighMask); - Vector128 r = Sse2.Or(r0, r1); - Vector128 gb0 = Sse2.And(input0, collectColorBlueTransformsGreenBlueMask); - Vector128 gb1 = Sse2.And(input1, collectColorBlueTransformsGreenBlueMask); - Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); - Vector128 g = Sse2.And(gb.AsByte(), collectColorBlueTransformsGreenMask); - Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr); - Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); - Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte()); - Vector128 d = Sse2.Subtract(c, a.AsByte()); - Vector128 e = Sse2.And(d, collectColorBlueTransformsBlueMask); + Vector128 r0 = Vector128_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask); + Vector128 r1 = Vector128_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask); + Vector128 r = r0 | r1; + Vector128 gb0 = input0 & collectColorBlueTransformsGreenBlueMask; + Vector128 gb1 = input1 & collectColorBlueTransformsGreenBlueMask; + Vector128 gb = Vector128_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector128 g = gb.AsByte() & 
collectColorBlueTransformsGreenMask; + Vector128 a = Vector128_.MultiplyHigh(r.AsInt16(), multsr); + Vector128 b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); + Vector128 c = gb.AsByte() - b.AsByte(); + Vector128 d = c - a.AsByte(); + Vector128 e = d & collectColorBlueTransformsBlueMask; ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = e.AsUInt16(); @@ -109,16 +109,16 @@ internal static class ColorSpaceTransformUtils int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); } } else { - CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); + CollectColorBlueTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); } } - private static void CollectColorBlueTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) + private static void CollectColorBlueTransformsScalar(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) { int pos = 0; while (tileHeight-- > 0) @@ -135,11 +135,11 @@ internal static class ColorSpaceTransformUtils public static void CollectColorRedTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) { - if (Avx2.IsSupported && tileWidth >= 16) + if (Vector256.IsHardwareAccelerated && tileWidth >= 16) { Vector256 collectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte(); Vector256 collectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte(); - var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed)); + Vector256 multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed)); const int 
span = 16; Span values = stackalloc ushort[span]; for (int y = 0; y < tileHeight; y++) @@ -152,15 +152,15 @@ internal static class ColorSpaceTransformUtils nuint input1Idx = x + (span / 2); Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector256 g0 = Avx2.And(input0, collectColorRedTransformsGreenMask256); // 0 0 | g 0 - Vector256 g1 = Avx2.And(input1, collectColorRedTransformsGreenMask256); - Vector256 g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 - Vector256 a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r - Vector256 a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16); - Vector256 a = Avx2.PackUnsignedSaturate(a0, a1); // x r - Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr - Vector256 c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r' - Vector256 d = Avx2.And(c, collectColorRedTransformsAndMask256); // 0 r' + Vector256 g0 = input0 & collectColorRedTransformsGreenMask256; // 0 0 | g 0 + Vector256 g1 = input1 & collectColorRedTransformsGreenMask256; + Vector256 g = Vector256_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 + Vector256 a0 = Vector256.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r + Vector256 a1 = Vector256.ShiftRightLogical(input1.AsInt32(), 16); + Vector256 a = Vector256_.PackUnsignedSaturate(a0, a1); // x r + Vector256 b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); // x dr + Vector256 c = a.AsByte() - b.AsByte(); // x r' + Vector256 d = c & collectColorRedTransformsAndMask256; // 0 r' ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = d.AsUInt16(); @@ -175,14 +175,14 @@ internal static class ColorSpaceTransformUtils int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); + 
CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); } } - else if (Sse41.IsSupported) + else if (Vector128.IsHardwareAccelerated) { Vector128 collectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte(); Vector128 collectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte(); - var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed)); + Vector128 multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed)); const int span = 8; Span values = stackalloc ushort[span]; for (int y = 0; y < tileHeight; y++) @@ -195,15 +195,15 @@ internal static class ColorSpaceTransformUtils nuint input1Idx = x + (span / 2); Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector128 g0 = Sse2.And(input0, collectColorRedTransformsGreenMask); // 0 0 | g 0 - Vector128 g1 = Sse2.And(input1, collectColorRedTransformsGreenMask); - Vector128 g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 - Vector128 a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r - Vector128 a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16); - Vector128 a = Sse41.PackUnsignedSaturate(a0, a1); // x r - Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr - Vector128 c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r' - Vector128 d = Sse2.And(c, collectColorRedTransformsAndMask); // 0 r' + Vector128 g0 = input0 & collectColorRedTransformsGreenMask; // 0 0 | g 0 + Vector128 g1 = input1 & collectColorRedTransformsGreenMask; + Vector128 g = Vector128_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 + Vector128 a0 = Vector128.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r + Vector128 a1 = Vector128.ShiftRightLogical(input1.AsInt32(), 16); + Vector128 a = Vector128_.PackUnsignedSaturate(a0, a1); // x r + Vector128 b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); // x dr + 
Vector128 c = a.AsByte() - b.AsByte(); // x r' + Vector128 d = c & collectColorRedTransformsAndMask; // 0 r' ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = d.AsUInt16(); @@ -218,16 +218,16 @@ internal static class ColorSpaceTransformUtils int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); + CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); } } else { - CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo); + CollectColorRedTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToRed, histo); } } - private static void CollectColorRedTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) + private static void CollectColorRedTransformsScalar(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) { int pos = 0; while (tileHeight-- > 0) diff --git a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs index c5e8c975f..6073888fe 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs @@ -71,17 +71,17 @@ public class ColorSpaceTransformUtilsTests public void CollectColorBlueTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.AllowAll); [Fact] - public void CollectColorBlueTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41); + public void CollectColorBlueTransforms_WithoutVector128_Works() => 
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41); [Fact] - public void CollectColorBlueTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2); + public void CollectColorBlueTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2); [Fact] public void CollectColorRedTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.AllowAll); [Fact] - public void CollectColorRedTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41); + public void CollectColorRedTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41); [Fact] - public void CollectColorRedTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2); + public void CollectColorRedTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2); }