diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ca14ae4c3..5f91dcd99 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -884,23 +884,6 @@ internal static class Numerics accumulator += intHigh; } - /// - /// Reduces elements of the vector into one sum. - /// - /// The accumulator to reduce. - /// The sum of all elements. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int ReduceSum(Vector128 accumulator) - { - // Add odd to even. - Vector128 vsum = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0b_11_11_01_01)); - - // Add high to low. - vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10)); - - return Sse2.ConvertToInt32(vsum); - } - /// /// Reduces elements of the vector into one sum. /// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 96ddb7976..ff5ea5de3 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -66,9 +66,9 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) || - (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) || - Vector128.IsHardwareAccelerated) + if (Vector512.IsHardwareAccelerated || + Vector256.IsHardwareAccelerated || + Vector128.IsHardwareAccelerated) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -112,9 +112,9 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) || - (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) || - (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)) + if (Vector512.IsHardwareAccelerated || + Vector256.IsHardwareAccelerated || + Vector128.IsHardwareAccelerated) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -158,7 +158,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { int remainder = source.Length % (Vector128.Count * 3); @@ -190,7 +190,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated) { int remainder = source.Length % (Vector128.Count * 3); @@ -223,7 +223,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated) { int remainder = source.Length & ((Vector128.Count * 4) - 1); // bit-hack for modulo @@ -249,7 +249,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) + if (Vector512.IsHardwareAccelerated) { ref Vector512 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ 
-277,7 +277,7 @@ internal static partial class SimdUtils } } } - else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) + else if (Vector256.IsHardwareAccelerated) { ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); @@ -341,7 +341,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) + if (Vector512.IsHardwareAccelerated) { Span temp = stackalloc byte[Vector512.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -373,8 +373,13 @@ internal static partial class SimdUtils } } } - else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) + else if (Vector256.IsHardwareAccelerated) { + // ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb). + // MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte, + // so the shuffle will not zero elements. Because vpshufb uses only the low 4 bits (b[i] & 0x0F) + // for indexing within each lane, and ignores the upper bits unless bit 7 is set, + // this usage is guaranteed to remain within-lane and non-zeroing. Span temp = stackalloc byte[Vector256.Count]; Shuffle.MMShuffleSpan(ref temp, control); Vector256 mask = Unsafe.As>(ref MemoryMarshal.GetReference(temp)); @@ -391,21 +396,21 @@ internal static partial class SimdUtils ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i); - vd0 = Vector256_.ShuffleNative(vs0, mask); - Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask); - Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask); - Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask); + vd0 = Vector256_.ShufflePerLane(vs0, mask); + Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask); + Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask); + Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask); } if (m > 0) { for (nuint i = u; i < n; i++) { - Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask); + Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask); } } } - else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte) + else if (Vector128.IsHardwareAccelerated) { Span temp = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -445,9 +450,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && - Vector128_.SupportsShuffleNativeByte && - Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); @@ -507,10 +510,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && - Vector128_.SupportsShuffleNativeByte && - Vector128_.SupportsShiftByte && - Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 fill = 
Vector128.Create(0xff000000ff000000ul).AsByte(); @@ -553,10 +553,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && - Vector128_.SupportsShuffleNativeByte && - Vector128_.SupportsShiftByte && - Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); Vector128 maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index dbe0a1fce..a5d377eb9 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -1,10 +1,8 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. -using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.Wasm; @@ -25,43 +23,34 @@ internal static class Vector128_ #pragma warning restore SA1649 // File name should match first type name { /// - /// Gets a value indicating whether shuffle operations are supported. + /// Average packed unsigned 8-bit integers in and , and store the results. /// - public static bool SupportsShuffleNativeByte + /// + /// The first vector containing packed unsigned 8-bit integers to average. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to average. + /// + /// + /// A vector containing the average of the packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Average(Vector128 left, Vector128 right) { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get + if (Sse2.IsSupported) { - if (Vector128.IsHardwareAccelerated) - { - if (RuntimeInformation.ProcessArchitecture is Architecture.X86 or Architecture.X64) - { - return Ssse3.IsSupported; - } - - return true; - } - - return false; + return Sse2.Average(left, right); } - } - /// - /// Gets a value indicating whether right align operations are supported. - /// - public static bool SupportsAlignRight - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Ssse3.IsSupported || AdvSimd.IsSupported; - } + if (AdvSimd.IsSupported) + { + return AdvSimd.FusedAddRoundedHalving(left, right); + } - /// - /// Gets a value indicating whether right or left byte shift operations are supported. - /// - public static bool SupportsShiftByte - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Sse2.IsSupported || AdvSimd.IsSupported; + // Account for potential 9th bit to ensure correct rounded result. + return Vector128.Narrow( + (Vector128.WidenLower(left) + Vector128.WidenLower(right) + Vector128.One) >> 1, + (Vector128.WidenUpper(left) + Vector128.WidenUpper(right) + Vector128.One) >> 1); } /// @@ -88,6 +77,87 @@ internal static class Vector128_ return Vector128.Shuffle(vector, indices); } + /// + /// Creates a new vector by selecting values from an input vector using the control. + /// + /// The input vector from which values are selected. + /// The shuffle control byte. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShuffleNative(Vector128 vector, [ConstantExpected] byte control) + { + // Don't use InverseMMShuffle here as we want to avoid the cast. 
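+ // Worked example (illustrative): control = 0b01_00_11_10, i.e. _MM_SHUFFLE(1, 0, 3, 2),
+ // decodes to indices (2, 3, 0, 1), so (a, b, c, d) becomes (c, d, a, b) - the same element
+ // selection Sse.Shuffle/Avx.Shuffle performs with that immediate.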
+ Vector128 indices = Vector128.Create( + control & 0x3, + (control >> 2) & 0x3, + (control >> 4) & 0x3, + (control >> 6) & 0x3); + + return Vector128.Shuffle(vector, indices); + } + + /// + /// Shuffle 16-bit integers in the high 64 bits of using the control in . + /// Store the results in the high 64 bits of the destination, with the low 64 bits being copied from . + /// + /// The input vector containing packed 16-bit integers to shuffle. + /// The shuffle control byte. + /// + /// A vector containing the shuffled 16-bit integers in the high 64 bits, with the low 64 bits copied from . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShuffleHigh(Vector128 value, [ConstantExpected] byte control) + { + if (Sse2.IsSupported) + { + return Sse2.ShuffleHigh(value, control); + } + + // Don't use InverseMMShuffle here as we want to avoid the cast. + Vector128 indices = Vector128.Create( + 0, + 1, + 2, + 3, + (short)((control & 0x3) + 4), + (short)(((control >> 2) & 0x3) + 4), + (short)(((control >> 4) & 0x3) + 4), + (short)(((control >> 6) & 0x3) + 4)); + + return Vector128.Shuffle(value, indices); + } + + /// + /// Shuffle 16-bit integers in the low 64 bits of using the control in . + /// Store the results in the low 64 bits of the destination, with the high 64 bits being copied from . + /// + /// The input vector containing packed 16-bit integers to shuffle. + /// The shuffle control byte. + /// + /// A vector containing the shuffled 16-bit integers in the low 64 bits, with the high 64 bits copied from . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShuffleLow(Vector128 value, [ConstantExpected] byte control) + { + if (Sse2.IsSupported) + { + return Sse2.ShuffleLow(value, control); + } + + // Don't use InverseMMShuffle here as we want to avoid the cast. + Vector128 indices = Vector128.Create( + (short)(control & 0x3), + (short)((control >> 2) & 0x3), + (short)((control >> 4) & 0x3), + (short)((control >> 6) & 0x3), + 4, + 5, + 6, + 7); + + return Vector128.Shuffle(value, indices); + } + /// /// Creates a new vector by selecting values from an input vector using a set of indices. /// @@ -133,8 +203,7 @@ internal static class Vector128_ return AdvSimd.ExtractVector128(value, Vector128.Zero, numBytes); } - ThrowUnreachableException(); - return default; + return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + Vector128.Create(numBytes)); } /// @@ -158,8 +227,28 @@ internal static class Vector128_ #pragma warning restore CA1857 // A constant is expected for the parameter } - ThrowUnreachableException(); - return default; + return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) - Vector128.Create(numBytes)); + } + + /// + /// Shift packed 16-bit integers in left by while + /// shifting in zeros, and store the results + /// + /// The vector containing packed 16-bit integers to shift. + /// The number of bits to shift left. + /// + /// A vector containing the packed 16-bit integers shifted left by , with zeros shifted in. 
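+ /// A shift count of 16 or greater zeroes every lane, matching SSE2 semantics; the software
+ /// fallback below enforces this with an explicit guard.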
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShiftLeftLogical(Vector128 value, [ConstantExpected] byte count) + { + // Zero lanes where count >= 16 to match SSE2 + if (count >= 16) + { + return Vector128.Zero; + } + + return value << count; } /// @@ -182,8 +271,9 @@ internal static class Vector128_ return AdvSimd.ExtractVector128(right, left, mask); } - ThrowUnreachableException(); - return default; +#pragma warning disable CA1857 // A constant is expected for the parameter + return ShiftLeftBytesInVector(left, (byte)(Vector128.Count - mask)) | ShiftRightBytesInVector(right, mask); +#pragma warning restore CA1857 // A constant is expected for the parameter } /// @@ -304,6 +394,37 @@ internal static class Vector128_ return Vector128.Narrow(lefClamped, rightClamped); } + /// + /// Packs signed 32-bit integers to unsigned 16-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 PackUnsignedSaturate(Vector128 left, Vector128 right) + { + if (Sse41.IsSupported) + { + return Sse41.PackUnsignedSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right); + } + + Vector128 min = Vector128.Create((int)ushort.MinValue); + Vector128 max = Vector128.Create((int)ushort.MaxValue); + Vector128 lefClamped = Clamp(left, min, max).AsUInt32(); + Vector128 rightClamped = Clamp(right, min, max).AsUInt32(); + return Vector128.Narrow(lefClamped, rightClamped); + } + /// /// Packs signed 32-bit integers to signed 16-bit integers and saturates. /// @@ -335,6 +456,37 @@ internal static class Vector128_ return Vector128.Narrow(lefClamped, rightClamped); } + /// + /// Packs signed 16-bit integers to signed 8-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 PackSignedSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.PackSignedSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateSigned(left, right); + } + + Vector128 min = Vector128.Create((short)sbyte.MinValue); + Vector128 max = Vector128.Create((short)sbyte.MaxValue); + Vector128 lefClamped = Clamp(left, min, max); + Vector128 rightClamped = Clamp(right, min, max); + return Vector128.Narrow(lefClamped, rightClamped); + } + /// /// Restricts a vector between a minimum and a maximum value. /// @@ -347,6 +499,864 @@ internal static class Vector128_ public static Vector128 Clamp(Vector128 value, Vector128 min, Vector128 max) => Vector128.Min(Vector128.Max(value, min), max); - [DoesNotReturn] - private static void ThrowUnreachableException() => throw new UnreachableException(); + /// + /// Multiply packed signed 16-bit integers in and , producing + /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and + /// pack the results. + /// + /// + /// The first vector containing packed signed 16-bit integers to multiply and add. 
+ /// + /// + /// The second vector containing packed signed 16-bit integers to multiply and add. + /// + /// + /// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyAddAdjacent(left, right); + } + + if (AdvSimd.IsSupported) + { + Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower()); + Vector128 prodHi = AdvSimd.MultiplyWideningUpper(left, right); + + if (AdvSimd.Arm64.IsSupported) + { + return AdvSimd.Arm64.AddPairwise(prodLo, prodHi); + } + + Vector64 v0 = AdvSimd.AddPairwise(prodLo.GetLower(), prodLo.GetUpper()); + Vector64 v1 = AdvSimd.AddPairwise(prodHi.GetLower(), prodHi.GetUpper()); + return Vector128.Create(v0, v1); + } + + { + // Widen each half of the short vectors into two int vectors + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; + + // Extract the low and high parts of the products shuffling them to form a result we can add together. + // Use out-of-bounds to zero out the unused lanes. + Vector128 v0 = Vector128.Shuffle(prodLo, Vector128.Create(0, 2, 8, 8)); + Vector128 v1 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 0, 2)); + Vector128 v2 = Vector128.Shuffle(prodLo, Vector128.Create(1, 3, 8, 8)); + Vector128 v3 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 1, 3)); + + return v0 + v1 + v2 + v3; + } + } + + /// + /// Horizontally add adjacent pairs of 16-bit integers in and , and + /// pack the signed 16-bit results. + /// + /// + /// The first vector containing packed signed 16-bit integers to add. + /// + /// + /// The second vector containing packed signed 16-bit integers to add. + /// + /// + /// A vector containing the results of horizontally adding adjacent pairs of packed signed 16-bit integers + /// + public static Vector128 HorizontalAdd(Vector128 left, Vector128 right) + { + if (Ssse3.IsSupported) + { + return Ssse3.HorizontalAdd(left, right); + } + + if (AdvSimd.Arm64.IsSupported) + { + return AdvSimd.Arm64.AddPairwise(left, right); + } + + if (AdvSimd.IsSupported) + { + Vector128 v0 = AdvSimd.AddPairwiseWidening(left); + Vector128 v1 = AdvSimd.AddPairwiseWidening(right); + + return Vector128.Narrow(v0, v1); + } + + { + // Extract the low and high parts of the products shuffling them to form a result we can add together. + // Use out-of-bounds to zero out the unused lanes. + Vector128 even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8); + Vector128 odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8); + Vector128 v0 = Vector128.Shuffle(right, even); + Vector128 v1 = Vector128.Shuffle(right, odd); + Vector128 v2 = Vector128.Shuffle(left, even); + Vector128 v3 = Vector128.Shuffle(left, odd); + + return v0 + v1 + v2 + v3; + } + } + + /// + /// Multiply the packed 16-bit integers in and , producing + /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. 
+ /// + /// + /// A vector containing the high 16 bits of the products of the packed 16-bit integers + /// from and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower()); + Vector128 prodHi = AdvSimd.MultiplyWideningUpper(left, right); + + prodLo >>= 16; + prodHi >>= 16; + + return Vector128.Narrow(prodLo, prodHi); + } + + { + // Widen each half of the short vectors into two int vectors + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; + + // Arithmetic shift right by 16 bits to extract the high word + prodLo >>= 16; + prodHi >>= 16; + + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } + } + + /// + /// Multiply the packed 16-bit unsigned integers in and , producing + /// intermediate unsigned 32-bit integers, and store the high 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit unsigned integers to multiply. + /// + /// + /// The second vector containing packed 16-bit unsigned integers to multiply. + /// + /// + /// A vector containing the high 16 bits of the products of the packed 16-bit unsigned integers + /// from and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower()); + Vector128 prodHi = AdvSimd.MultiplyWideningUpper(left, right); + + prodLo >>= 16; + prodHi >>= 16; + + return Vector128.Narrow(prodLo, prodHi); + } + + { + // Widen each half of the short vectors into two uint vectors + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; + + // Arithmetic shift right by 16 bits to extract the high word + prodLo >>= 16; + prodHi >>= 16; + + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } + } + + /// + /// Unpack and interleave 64-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 64-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 64-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 64-bit integers from the high + /// halves of and . 
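+ /// For example, unpacking (a0, a1) and (b0, b1) yields (a1, b1).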
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + return Vector128.Create(left.GetUpper(), right.GetUpper()); + } + + /// + /// Unpack and interleave 64-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 64-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 64-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 64-bit integers from the low + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + return Vector128.Create(left.GetLower(), right.GetLower()); + } + + /// + /// Unpack and interleave 32-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 32-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 32-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 32-bit integers from the high + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 2, 1, 3)); + } + + /// + /// Unpack and interleave 32-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 32-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 32-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 32-bit integers from the low + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 2, 1, 3)); + } + + /// + /// Unpack and interleave 16-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 16-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 16-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 16-bit integers from the high + /// halves of and . 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7)); + } + + /// + /// Unpack and interleave 16-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 16-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 16-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 16-bit integers from the low + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7)); + } + + /// + /// Unpack and interleave 8-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 8-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit integers from the high + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); + } + + /// + /// Unpack and interleave 8-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 8-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit integers from the low + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); + } + + /// + /// Unpack and interleave 8-bit signed integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit signed integers to unpack from the high half. + /// + /// + /// The second vector containing packed 8-bit signed integers to unpack from the high half. 
+ /// + /// + /// A vector containing the unpacked and interleaved 8-bit signed integers from the high + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); + } + + /// + /// Unpack and interleave 8-bit signed integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit signed integers to unpack from the low half. + /// + /// + /// The second vector containing packed 8-bit signed integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit signed integers from the low + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); + } + + /// + /// Subtract packed signed 16-bit integers in from packed signed 16-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed signed 16-bit integers to subtract from. + /// + /// + /// The second vector containing packed signed 16-bit integers to subtract. + /// + /// + /// A vector containing the results of subtracting packed signed 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.SubtractSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.SubtractSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.SubtractSaturate(left, right); + } + + // Widen inputs to 32-bit signed + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Subtract + Vector128 diffLo = leftLo - rightLo; + Vector128 diffHi = leftHi - rightHi; + + // Clamp to signed 16-bit range + Vector128 min = Vector128.Create((int)short.MinValue); + Vector128 max = Vector128.Create((int)short.MaxValue); + + diffLo = Clamp(diffLo, min, max); + diffHi = Clamp(diffHi, min, max); + + // Narrow back to 16 bit signed. + return Vector128.Narrow(diffLo, diffHi); + } + + /// + /// Subtract packed unsigned 16-bit integers in from packed unsigned 16-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 16-bit integers to subtract from. + /// + /// + /// The second vector containing packed unsigned 16-bit integers to subtract. 
+ /// + /// + /// A vector containing the results of subtracting packed unsigned 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.SubtractSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.SubtractSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.SubtractSaturate(left, right); + } + + // Widen inputs to 32-bit signed + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Subtract + Vector128 diffLo = leftLo - rightLo; + Vector128 diffHi = leftHi - rightHi; + + // Clamp to signed 16-bit range + Vector128 min = Vector128.Create((uint)ushort.MinValue); + Vector128 max = Vector128.Create((uint)ushort.MaxValue); + + diffLo = Clamp(diffLo, min, max); + diffHi = Clamp(diffHi, min, max); + + // Narrow back to 16 bit signed. + return Vector128.Narrow(diffLo, diffHi); + } + + /// + /// Add packed unsigned 8-bit integers in to packed unsigned 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to add to. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to add. + /// + /// + /// A vector containing the results of adding packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 AddSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.AddSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.AddSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.AddSaturate(left, right); + } + + // Widen inputs to 16-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Add + Vector128 sumLo = leftLo + rightLo; + Vector128 sumHi = leftHi + rightHi; + + // Clamp to signed 8-bit range + Vector128 max = Vector128.Create((ushort)byte.MaxValue); + + sumLo = Clamp(sumLo, Vector128.Zero, max); + sumHi = Clamp(sumHi, Vector128.Zero, max); + + // Narrow back to bytes + return Vector128.Narrow(sumLo, sumHi); + } + + /// + /// Add packed unsigned 16-bit integers in to packed unsigned 16-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 16-bit integers to add to. + /// + /// + /// The second vector containing packed unsigned 16-bit integers to add. 
+ /// + /// + /// A vector containing the results of adding packed unsigned 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 AddSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.AddSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.AddSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.AddSaturate(left, right); + } + + // Widen inputs to 32-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Add + Vector128 sumLo = leftLo + rightLo; + Vector128 sumHi = leftHi + rightHi; + + // Clamp to signed 16-bit range + Vector128 max = Vector128.Create((uint)ushort.MaxValue); + + sumLo = Clamp(sumLo, Vector128.Zero, max); + sumHi = Clamp(sumHi, Vector128.Zero, max); + + // Narrow back to 16 bit unsigned. + return Vector128.Narrow(sumLo, sumHi); + } + + /// + /// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to subtract from. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to subtract. + /// + /// + /// A vector containing the results of subtracting packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.SubtractSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.SubtractSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.SubtractSaturate(left, right); + } + + // Widen inputs to 16-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Subtract + Vector128 diffLo = leftLo - rightLo; + Vector128 diffHi = leftHi - rightHi; + + // Clamp to signed 8-bit range + Vector128 max = Vector128.Create((ushort)byte.MaxValue); + + diffLo = Clamp(diffLo, Vector128.Zero, max); + diffHi = Clamp(diffHi, Vector128.Zero, max); + + // Narrow back to bytes + return Vector128.Narrow(diffLo, diffHi); + } + + /// + /// Add packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to add to. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to add. 
+ /// + /// + /// A vector containing the results of adding packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 AddSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.AddSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.AddSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.AddSaturate(left, right); + } + + // Widen inputs to 16-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Add + Vector128 sumLo = leftLo + rightLo; + Vector128 sumHi = leftHi + rightHi; + + // Clamp to signed 8-bit range + Vector128 min = Vector128.Create((short)sbyte.MinValue); + Vector128 max = Vector128.Create((short)sbyte.MaxValue); + + sumLo = Clamp(sumLo, min, max); + sumHi = Clamp(sumHi, min, max); + + // Narrow back to signed bytes + return Vector128.Narrow(sumLo, sumHi); + } + + /// + /// Subtract packed signed 8-bit integers in from packed signed 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed signed 8-bit integers to subtract from. + /// + /// + /// The second vector containing packed signed 8-bit integers to subtract. + /// + /// + /// A vector containing the results of subtracting packed signed 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.SubtractSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.SubtractSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.SubtractSaturate(left, right); + } + + // Widen inputs to 16-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Subtract + Vector128 diffLo = leftLo - rightLo; + Vector128 diffHi = leftHi - rightHi; + + // Clamp to signed 8-bit range + Vector128 min = Vector128.Create((short)sbyte.MinValue); + Vector128 max = Vector128.Create((short)sbyte.MaxValue); + + diffLo = Clamp(diffLo, min, max); + diffHi = Clamp(diffHi, min, max); + + // Narrow back to signed bytes + return Vector128.Narrow(diffLo, diffHi); + } } diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 817d6e607..14ac13dd8 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. -using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; @@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers; internal static class Vector256_ #pragma warning restore SA1649 // File name should match first type name { - /// - /// Gets a value indicating whether shuffle byte operations are supported. - /// - public static bool SupportsShuffleNativeFloat - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx.IsSupported; - } - - /// - /// Gets a value indicating whether shuffle byte operations are supported. 
- /// - public static bool SupportsShuffleNativeByte - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx2.IsSupported; - } - /// /// Creates a new vector by selecting values from an input vector using a set of indices. /// @@ -47,15 +28,7 @@ internal static class Vector256_ /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 ShuffleNative(Vector256 vector, [ConstantExpected] byte control) - { - if (Avx.IsSupported) - { - return Avx.Shuffle(vector, vector, control); - } - - ThrowUnreachableException(); - return default; - } + => Avx.Shuffle(vector, vector, control); /// /// Creates a new vector by selecting values from an input vector using a set of indices. @@ -66,15 +39,17 @@ internal static class Vector256_ /// /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 ShuffleNative(Vector256 vector, Vector256 indices) + public static Vector256 ShufflePerLane(Vector256 vector, Vector256 indices) { if (Avx2.IsSupported) { return Avx2.Shuffle(vector, indices); } - ThrowUnreachableException(); - return default; + Vector128 indicesLo = indices.GetLower(); + Vector128 lower = Vector128_.ShuffleNative(vector.GetLower(), indicesLo); + Vector128 upper = Vector128_.ShuffleNative(vector.GetUpper(), indicesLo); + return Vector256.Create(lower, upper); } /// @@ -162,6 +137,54 @@ internal static class Vector256_ return (vm0 * vm1) - vs; } + /// + /// Multiply packed signed 16-bit integers in and , producing + /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and + /// pack the results. + /// + /// + /// The first vector containing packed signed 16-bit integers to multiply and add. + /// + /// + /// The second vector containing packed signed 16-bit integers to multiply and add. + /// + /// + /// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplyAddAdjacent(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.MultiplyAddAdjacent(left, right); + } + + return Vector256.Create( + Vector128_.MultiplyAddAdjacent(left.GetLower(), right.GetLower()), + Vector128_.MultiplyAddAdjacent(left.GetUpper(), right.GetUpper())); + } + + /// + /// Packs signed 32-bit integers to signed 16-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 PackUnsignedSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.PackUnsignedSaturate(left, right); + } + + Vector256 min = Vector256.Create((int)ushort.MinValue); + Vector256 max = Vector256.Create((int)ushort.MaxValue); + Vector256 lefClamped = Clamp(left, min, max).AsUInt32(); + Vector256 rightClamped = Clamp(right, min, max).AsUInt32(); + return Vector256.Narrow(lefClamped, rightClamped); + } + /// /// Packs signed 32-bit integers to signed 16-bit integers and saturates. /// @@ -183,6 +206,27 @@ internal static class Vector256_ return Vector256.Narrow(lefClamped, rightClamped); } + /// + /// Packs signed 16-bit integers to signed 8-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 PackSignedSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.PackSignedSaturate(left, right); + } + + Vector256 min = Vector256.Create((short)sbyte.MinValue); + Vector256 max = Vector256.Create((short)sbyte.MaxValue); + Vector256 lefClamped = Clamp(left, min, max); + Vector256 rightClamped = Clamp(right, min, max); + return Vector256.Narrow(lefClamped, rightClamped); + } + /// /// Restricts a vector between a minimum and a maximum value. /// @@ -210,6 +254,211 @@ internal static class Vector256_ return Vector256.WidenLower(value.ToVector256()); } - [DoesNotReturn] - private static void ThrowUnreachableException() => throw new UnreachableException(); + /// + /// Multiply the packed 16-bit integers in and , producing + /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the low 16 bits of the products of the packed 16-bit integers + /// from and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplyLow(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.MultiplyLow(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector256 leftLower, Vector256 leftUpper) = Vector256.Widen(left); + (Vector256 rightLower, Vector256 rightUpper) = Vector256.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector256 prodLo = leftLower * rightLower; + Vector256 prodHi = leftUpper * rightUpper; + + // Narrow the two int vectors back into one short vector + return Vector256.Narrow(prodLo, prodHi); + } + + /// + /// Multiply the packed 16-bit integers in and , producing + /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the high 16 bits of the products of the packed 16-bit integers + /// from and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 MultiplyHigh(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.MultiplyHigh(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector256 leftLower, Vector256 leftUpper) = Vector256.Widen(left); + (Vector256 rightLower, Vector256 rightUpper) = Vector256.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector256 prodLo = leftLower * rightLower; + Vector256 prodHi = leftUpper * rightUpper; + + // Arithmetic shift right by 16 bits to extract the high word + prodLo >>= 16; + prodHi >>= 16; + + // Narrow the two int vectors back into one short vector + return Vector256.Narrow(prodLo, prodHi); + } + + /// + /// Unpack and interleave 32-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 32-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 32-bit integers to unpack from the low half. 
+ /// + /// + /// A vector containing the unpacked and interleaved 32-bit integers from the low + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 UnpackLow(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.UnpackLow(left, right); + } + + Vector128 lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower()); + Vector128 hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper()); + + return Vector256.Create(lo, hi); + } + + /// + /// Unpack and interleave 8-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 8-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit integers from the high + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 UnpackHigh(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.UnpackHigh(left, right); + } + + Vector128 lo = Vector128_.UnpackHigh(left.GetLower(), right.GetLower()); + Vector128 hi = Vector128_.UnpackHigh(left.GetUpper(), right.GetUpper()); + + return Vector256.Create(lo, hi); + } + + /// + /// Unpack and interleave 8-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 8-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit integers from the low + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 UnpackLow(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.UnpackLow(left, right); + } + + Vector128 lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower()); + Vector128 hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper()); + + return Vector256.Create(lo, hi); + } + + /// + /// Subtract packed signed 16-bit integers in from packed signed 16-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed signed 16-bit integers to subtract from. + /// + /// + /// The second vector containing packed signed 16-bit integers to subtract. + /// + /// + /// A vector containing the results of subtracting packed unsigned 16-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 SubtractSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.SubtractSaturate(left, right); + } + + return Vector256.Create( + Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()), + Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper())); + } + + /// + /// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to subtract from. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to subtract. 
+ /// + /// + /// A vector containing the results of subtracting packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 SubtractSaturate(Vector256 left, Vector256 right) + { + if (Avx2.IsSupported) + { + return Avx2.SubtractSaturate(left, right); + } + + return Vector256.Create( + Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()), + Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper())); + } } diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 63de5dc10..03ee4626c 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. -using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; @@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers; internal static class Vector512_ #pragma warning restore SA1649 // File name should match first type name { - /// - /// Gets a value indicating whether shuffle float operations are supported. - /// - public static bool SupportsShuffleNativeFloat - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx512F.IsSupported; - } - - /// - /// Gets a value indicating whether shuffle byte operations are supported. - /// - public static bool SupportsShuffleNativeByte - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx512BW.IsSupported; - } - /// /// Creates a new vector by selecting values from an input vector using the control. /// @@ -47,15 +28,7 @@ internal static class Vector512_ /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 ShuffleNative(Vector512 vector, [ConstantExpected] byte control) - { - if (Avx512F.IsSupported) - { - return Avx512F.Shuffle(vector, vector, control); - } - - ThrowUnreachableException(); - return default; - } + => Avx512F.Shuffle(vector, vector, control); /// /// Creates a new vector by selecting values from an input vector using a set of indices. @@ -73,8 +46,7 @@ internal static class Vector512_ return Avx512BW.Shuffle(vector, indices); } - ThrowUnreachableException(); - return default; + return Vector512.Shuffle(vector, indices); } /// @@ -85,25 +57,7 @@ internal static class Vector512_ /// The . 
[MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) - { - if (Avx512F.IsSupported) - { - return Avx512F.ConvertToVector512Int32(vector); - } - - if (Avx.IsSupported) - { - Vector256 lower = Avx.ConvertToVector256Int32(vector.GetLower()); - Vector256 upper = Avx.ConvertToVector256Int32(vector.GetUpper()); - return Vector512.Create(lower, upper); - } - - Vector512 sign = vector & Vector512.Create(-0.0f); - Vector512 val_2p23_f32 = sign | Vector512.Create(8388608.0f); - - val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32; - return Vector512.ConvertToInt32(val_2p23_f32 | sign); - } + => Avx512F.ConvertToVector512Int32(vector); /// /// Rounds all values in to the nearest integer @@ -112,28 +66,11 @@ internal static class Vector512_ /// The vector [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 RoundToNearestInteger(Vector512 vector) - { - if (Avx512F.IsSupported) - { - // imm8 = 0b1000: - // imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers) - // imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions) - return Avx512F.RoundScale(vector, 0b0000_1000); - } - - if (Avx.IsSupported) - { - Vector256 lower = Avx.RoundToNearestInteger(vector.GetLower()); - Vector256 upper = Avx.RoundToNearestInteger(vector.GetUpper()); - return Vector512.Create(lower, upper); - } - - Vector512 sign = vector & Vector512.Create(-0F); - Vector512 val_2p23_f32 = sign | Vector512.Create(8388608F); - val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32; - return val_2p23_f32 | sign; - } + // imm8 = 0b1000: + // imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers) + // imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions) + => Avx512F.RoundScale(vector, 0b0000_1000); /// /// Performs a multiplication and an addition of the . @@ -148,21 +85,7 @@ internal static class Vector512_ Vector512 va, Vector512 vm0, Vector512 vm1) - { - if (Avx512F.IsSupported) - { - return Avx512F.FusedMultiplyAdd(vm0, vm1, va); - } - - if (Fma.IsSupported) - { - Vector256 lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower()); - Vector256 upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper()); - return Vector512.Create(lower, upper); - } - - return va + (vm0 * vm1); - } + => Avx512F.FusedMultiplyAdd(vm0, vm1, va); /// /// Restricts a vector between a minimum and a maximum value. @@ -175,7 +98,4 @@ internal static class Vector512_ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 Clamp(Vector512 value, Vector512 min, Vector512 max) => Vector512.Min(Vector512.Max(value, min), max); - - [DoesNotReturn] - private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.RgbScalar.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.RgbScalar.cs index 4b2abc0ac..92be9e896 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.RgbScalar.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.RgbScalar.cs @@ -75,6 +75,7 @@ internal abstract partial class JpegColorConverterBase internal static void ConvertFromRgb(ComponentValues values, Span rLane, Span gLane, Span bLane) { + // TODO: This doesn't seem correct. We should be scaling to the maximum value here. 
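+ // (Scaling here would mean multiplying each lane by MaximumValue / 255F before the copy,
+ // assuming the RGB lanes arrive in the 0-255 range as they do in TiffCmykScalar.ConvertFromRgb.)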
rLane.CopyTo(values.Component0); gLane.CopyTo(values.Component1); bLane.CopyTo(values.Component2); diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykScalar.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykScalar.cs new file mode 100644 index 000000000..27449a368 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykScalar.cs @@ -0,0 +1,118 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Buffers; +using System.Numerics; +using System.Runtime.InteropServices; +using SixLabors.ImageSharp.ColorProfiles; +using SixLabors.ImageSharp.ColorProfiles.Icc; +using SixLabors.ImageSharp.Metadata.Profiles.Icc; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal abstract partial class JpegColorConverterBase +{ + /// + /// Color converter for tiff images, which use the jpeg compression and CMYK colorspace. + /// + internal sealed class TiffCmykScalar : JpegColorConverterScalar + { + public TiffCmykScalar(int precision) + : base(JpegColorSpace.TiffCmyk, precision) + { + } + + /// + public override void ConvertToRgbInPlace(in ComponentValues values) + => ConvertToRgbInPlace(in values, this.MaximumValue); + + /// + public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile) + => ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue); + + public override void ConvertFromRgb(in ComponentValues values, Span rLane, Span gLane, Span bLane) + => ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane); + + public static void ConvertToRgbInPlace(in ComponentValues values, float maxValue) + { + Span c0 = values.Component0; + Span c1 = values.Component1; + Span c2 = values.Component2; + Span c3 = values.Component3; + + float scale = 1 / maxValue; + for (int i = 0; i < c0.Length; i++) + { + float c = c0[i] * scale; + float m = c1[i] * scale; + float y = c2[i] * scale; + float k = 1 - (c3[i] * scale); + + c0[i] = (1 - c) * k; + c1[i] = (1 - m) * k; + c2[i] = (1 - y) * k; + } + } + + public static void ConvertFromRgb(in ComponentValues values, float maxValue, Span rLane, Span gLane, Span bLane) + { + Span c = values.Component0; + Span m = values.Component1; + Span y = values.Component2; + Span k = values.Component3; + + for (int i = 0; i < c.Length; i++) + { + float ctmp = 255F - rLane[i]; + float mtmp = 255F - gLane[i]; + float ytmp = 255F - bLane[i]; + float ktmp = MathF.Min(MathF.Min(ctmp, mtmp), ytmp); + + if (ktmp >= 255F) + { + ctmp = 0F; + mtmp = 0F; + ytmp = 0F; + } + else + { + float divisor = 1 / (255F - ktmp); + ctmp = (ctmp - ktmp) * divisor; + mtmp = (mtmp - ktmp) * divisor; + ytmp = (ytmp - ktmp) * divisor; + } + + c[i] = ctmp * maxValue; + m[i] = mtmp * maxValue; + y[i] = ytmp * maxValue; + k[i] = ktmp; + } + } + + public static void ConvertToRgbInPlaceWithIcc(Configuration configuration, IccProfile profile, in ComponentValues values, float maxValue) + { + using IMemoryOwner memoryOwner = configuration.MemoryAllocator.Allocate(values.Component0.Length * 4); + Span packed = memoryOwner.Memory.Span; + + Span c0 = values.Component0; + Span c1 = values.Component1; + Span c2 = values.Component2; + Span c3 = values.Component3; + + PackedNormalizeInterleave4(c0, c1, c2, c3, packed, maxValue); + + Span source = MemoryMarshal.Cast(packed); + Span destination = MemoryMarshal.Cast(packed)[..source.Length]; + + 
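+            // The packed buffer now holds normalized, interleaved CMYK samples. The supplied ICC profile describes
+            // that source space and CompactSrgbV4Profile.Profile is the target, so the conversion below writes RGB
+            // over the front of the same buffer before it is de-interleaved back into the component lanes.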
ColorConversionOptions options = new() + { + SourceIccProfile = profile, + TargetIccProfile = CompactSrgbV4Profile.Profile, + }; + ColorProfileConverter converter = new(options); + converter.Convert(source, destination); + + UnpackDeinterleave3(MemoryMarshal.Cast(packed)[..source.Length], c0, c1, c2); + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector128.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector128.cs new file mode 100644 index 000000000..6d52d5c72 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector128.cs @@ -0,0 +1,99 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using SixLabors.ImageSharp.Metadata.Profiles.Icc; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal abstract partial class JpegColorConverterBase +{ + internal sealed class TiffCmykVector128 : JpegColorConverterVector128 + { + public TiffCmykVector128(int precision) + : base(JpegColorSpace.TiffCmyk, precision) + { + } + + /// + public override void ConvertToRgbInPlace(in ComponentValues values) + { + ref Vector128 c0Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector128 c1Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector128 c2Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector128 c3Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + Vector128 scale = Vector128.Create(1 / this.MaximumValue); + + nuint n = values.Component0.Vector128Count(); + for (nuint i = 0; i < n; i++) + { + ref Vector128 c = ref Unsafe.Add(ref c0Base, i); + ref Vector128 m = ref Unsafe.Add(ref c1Base, i); + ref Vector128 y = ref Unsafe.Add(ref c2Base, i); + Vector128 k = Unsafe.Add(ref c3Base, i); + + k = Vector128.One - (k * scale); + c = (Vector128.One - (c * scale)) * k; + m = (Vector128.One - (m * scale)) * k; + y = (Vector128.One - (y * scale)) * k; + } + } + + /// + public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile) + => TiffCmykScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue); + + /// + public override void ConvertFromRgb(in ComponentValues values, Span rLane, Span gLane, Span bLane) + => ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane); + + public static void ConvertFromRgb(in ComponentValues values, float maxValue, Span rLane, Span gLane, Span bLane) + { + ref Vector128 destC = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector128 destM = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector128 destY = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector128 destK = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + ref Vector128 srcR = + ref Unsafe.As>(ref MemoryMarshal.GetReference(rLane)); + ref Vector128 srcG = + ref Unsafe.As>(ref MemoryMarshal.GetReference(gLane)); + ref Vector128 srcB = + ref Unsafe.As>(ref MemoryMarshal.GetReference(bLane)); + + Vector128 scale = Vector128.Create(maxValue); + + nuint n = values.Component0.Vector128Count(); + for (nuint i = 0; i < n; i++) + { + Vector128 ctmp = scale - Unsafe.Add(ref srcR, i); + Vector128 
mtmp = scale - Unsafe.Add(ref srcG, i); + Vector128 ytmp = scale - Unsafe.Add(ref srcB, i); + Vector128 ktmp = Vector128.Min(ctmp, Vector128.Min(mtmp, ytmp)); + + Vector128 kMask = ~Vector128.Equals(ktmp, scale); + Vector128 divisor = Vector128.One / (scale - ktmp); + + ctmp = ((ctmp - ktmp) * divisor) & kMask; + mtmp = ((mtmp - ktmp) * divisor) & kMask; + ytmp = ((ytmp - ktmp) * divisor) & kMask; + + Unsafe.Add(ref destC, i) = ctmp * scale; + Unsafe.Add(ref destM, i) = mtmp * scale; + Unsafe.Add(ref destY, i) = ytmp * scale; + Unsafe.Add(ref destK, i) = ktmp; + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector256.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector256.cs new file mode 100644 index 000000000..61b312a06 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector256.cs @@ -0,0 +1,99 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using SixLabors.ImageSharp.Metadata.Profiles.Icc; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal abstract partial class JpegColorConverterBase +{ + internal sealed class TiffCmykVector256 : JpegColorConverterVector256 + { + public TiffCmykVector256(int precision) + : base(JpegColorSpace.TiffCmyk, precision) + { + } + + /// + public override void ConvertToRgbInPlace(in ComponentValues values) + { + ref Vector256 c0Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector256 c1Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector256 c2Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector256 c3Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + Vector256 scale = Vector256.Create(1 / this.MaximumValue); + + nuint n = values.Component0.Vector256Count(); + for (nuint i = 0; i < n; i++) + { + ref Vector256 c = ref Unsafe.Add(ref c0Base, i); + ref Vector256 m = ref Unsafe.Add(ref c1Base, i); + ref Vector256 y = ref Unsafe.Add(ref c2Base, i); + Vector256 k = Unsafe.Add(ref c3Base, i); + + k = Vector256.One - (k * scale); + c = (Vector256.One - (c * scale)) * k; + m = (Vector256.One - (m * scale)) * k; + y = (Vector256.One - (y * scale)) * k; + } + } + + /// + public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile) + => CmykScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue); + + /// + public override void ConvertFromRgb(in ComponentValues values, Span rLane, Span gLane, Span bLane) + => ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane); + + public static void ConvertFromRgb(in ComponentValues values, float maxValue, Span rLane, Span gLane, Span bLane) + { + ref Vector256 destC = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector256 destM = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector256 destY = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector256 destK = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + ref Vector256 srcR = + ref Unsafe.As>(ref MemoryMarshal.GetReference(rLane)); + ref Vector256 srcG = + ref Unsafe.As>(ref MemoryMarshal.GetReference(gLane)); + ref Vector256 
srcB = + ref Unsafe.As>(ref MemoryMarshal.GetReference(bLane)); + + Vector256 scale = Vector256.Create(maxValue); + + nuint n = values.Component0.Vector256Count(); + for (nuint i = 0; i < n; i++) + { + Vector256 ctmp = scale - Unsafe.Add(ref srcR, i); + Vector256 mtmp = scale - Unsafe.Add(ref srcG, i); + Vector256 ytmp = scale - Unsafe.Add(ref srcB, i); + Vector256 ktmp = Vector256.Min(ctmp, Vector256.Min(mtmp, ytmp)); + + Vector256 kMask = ~Vector256.Equals(ktmp, scale); + Vector256 divisor = Vector256.One / (scale - ktmp); + + ctmp = ((ctmp - ktmp) * divisor) & kMask; + mtmp = ((mtmp - ktmp) * divisor) & kMask; + ytmp = ((ytmp - ktmp) * divisor) & kMask; + + Unsafe.Add(ref destC, i) = ctmp * scale; + Unsafe.Add(ref destM, i) = mtmp * scale; + Unsafe.Add(ref destY, i) = ytmp * scale; + Unsafe.Add(ref destK, i) = ktmp; + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector512.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector512.cs new file mode 100644 index 000000000..51d5cc76d --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector512.cs @@ -0,0 +1,108 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using SixLabors.ImageSharp.Metadata.Profiles.Icc; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal abstract partial class JpegColorConverterBase +{ + internal sealed class TiffCmykVector512 : JpegColorConverterVector512 + { + public TiffCmykVector512(int precision) + : base(JpegColorSpace.TiffCmyk, precision) + { + } + + /// + public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile) + => TiffCmykScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue); + + /// + protected override void ConvertToRgbInPlaceVectorized(in ComponentValues values) + { + ref Vector512 c0Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector512 c1Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector512 c2Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector512 c3Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + // Used for the color conversion + Vector512 scale = Vector512.Create(1 / this.MaximumValue); + + nuint n = values.Component0.Vector512Count(); + for (nuint i = 0; i < n; i++) + { + ref Vector512 c = ref Unsafe.Add(ref c0Base, i); + ref Vector512 m = ref Unsafe.Add(ref c1Base, i); + ref Vector512 y = ref Unsafe.Add(ref c2Base, i); + Vector512 k = Unsafe.Add(ref c3Base, i); + + k = Vector512.One - (k * scale); + c = (Vector512.One - (c * scale)) * k; + m = (Vector512.One - (m * scale)) * k; + y = (Vector512.One - (y * scale)) * k; + } + } + + /// + protected override void ConvertFromRgbVectorized(in ComponentValues values, Span rLane, Span gLane, Span bLane) + => ConvertFromRgbVectorized(in values, this.MaximumValue, rLane, gLane, bLane); + + /// + protected override void ConvertToRgbInPlaceScalarRemainder(in ComponentValues values) + => TiffCmykScalar.ConvertToRgbInPlace(values, this.MaximumValue); + + /// + protected override void ConvertFromRgbScalarRemainder(in ComponentValues values, Span rLane, Span gLane, Span bLane) + => 
TiffCmykScalar.ConvertFromRgb(values, this.MaximumValue, rLane, gLane, bLane); + + internal static void ConvertFromRgbVectorized(in ComponentValues values, float maxValue, Span rLane, Span gLane, Span bLane) + { + ref Vector512 destC = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector512 destM = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector512 destY = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector512 destK = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + ref Vector512 srcR = + ref Unsafe.As>(ref MemoryMarshal.GetReference(rLane)); + ref Vector512 srcG = + ref Unsafe.As>(ref MemoryMarshal.GetReference(gLane)); + ref Vector512 srcB = + ref Unsafe.As>(ref MemoryMarshal.GetReference(bLane)); + + Vector512 scale = Vector512.Create(maxValue); + + nuint n = values.Component0.Vector512Count(); + for (nuint i = 0; i < n; i++) + { + Vector512 ctmp = scale - Unsafe.Add(ref srcR, i); + Vector512 mtmp = scale - Unsafe.Add(ref srcG, i); + Vector512 ytmp = scale - Unsafe.Add(ref srcB, i); + Vector512 ktmp = Vector512.Min(ctmp, Vector512.Min(mtmp, ytmp)); + + Vector512 kMask = ~Vector512.Equals(ktmp, scale); + Vector512 divisor = Vector512.One / (scale - ktmp); + + ctmp = ((ctmp - ktmp) * divisor) & kMask; + mtmp = ((mtmp - ktmp) * divisor) & kMask; + ytmp = ((ytmp - ktmp) * divisor) & kMask; + + Unsafe.Add(ref destC, i) = ctmp * scale; + Unsafe.Add(ref destM, i) = mtmp * scale; + Unsafe.Add(ref destY, i) = ytmp * scale; + Unsafe.Add(ref destK, i) = ktmp; + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKScalar.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKScalar.cs new file mode 100644 index 000000000..495a20831 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKScalar.cs @@ -0,0 +1,153 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Buffers; +using System.Numerics; +using System.Runtime.InteropServices; +using SixLabors.ImageSharp.ColorProfiles; +using SixLabors.ImageSharp.ColorProfiles.Icc; +using SixLabors.ImageSharp.Metadata.Profiles.Icc; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal abstract partial class JpegColorConverterBase +{ + /// + /// Color converter for tiff images, which use the jpeg compression and CMYK colorspace. + /// + internal sealed class TiffYccKScalar : JpegColorConverterScalar + { + // Derived from ITU-T Rec. 
T.871 + internal const float RCrMult = 1.402f; + internal const float GCbMult = (float)(0.114 * 1.772 / 0.587); + internal const float GCrMult = (float)(0.299 * 1.402 / 0.587); + internal const float BCbMult = 1.772f; + + public TiffYccKScalar(int precision) + : base(JpegColorSpace.TiffYccK, precision) + { + } + + /// + public override void ConvertToRgbInPlace(in ComponentValues values) + => ConvertToRgbInPlace(in values, this.MaximumValue, this.HalfValue); + + /// + public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile) + => ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue); + + public override void ConvertFromRgb(in ComponentValues values, Span rLane, Span gLane, Span bLane) + => ConvertFromRgb(values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane); + + public static void ConvertToRgbInPlace(in ComponentValues values, float maxValue, float halfValue) + { + Span c0 = values.Component0; + Span c1 = values.Component1; + Span c2 = values.Component2; + Span c3 = values.Component3; + + float scale = 1F / maxValue; + halfValue *= scale; + + for (int i = 0; i < values.Component0.Length; i++) + { + float y = c0[i] * scale; + float cb = (c1[i] * scale) - halfValue; + float cr = (c2[i] * scale) - halfValue; + float scaledK = 1 - (c3[i] * scale); + + // r = y + (1.402F * cr); + // g = y - (0.344136F * cb) - (0.714136F * cr); + // b = y + (1.772F * cb); + c0[i] = (y + (RCrMult * cr)) * scaledK; + c1[i] = (y - (GCbMult * cb) - (GCrMult * cr)) * scaledK; + c2[i] = (y + (BCbMult * cb)) * scaledK; + } + } + + public static void ConvertFromRgb(in ComponentValues values, float halfValue, float maxValue, Span rLane, Span gLane, Span bLane) + { + Span y = values.Component0; + Span cb = values.Component1; + Span cr = values.Component2; + Span k = values.Component3; + + for (int i = 0; i < cr.Length; i++) + { + // Scale down to [0-1] + const float divisor = 1F / 255F; + float r = rLane[i] * divisor; + float g = gLane[i] * divisor; + float b = bLane[i] * divisor; + + float ytmp; + float cbtmp; + float crtmp; + float ktmp = 1F - MathF.Max(r, MathF.Max(g, b)); + + if (ktmp >= 1F) + { + ytmp = 0F; + cbtmp = 0.5F; + crtmp = 0.5F; + ktmp = maxValue; + } + else + { + float kmask = 1F / (1F - ktmp); + r *= kmask; + g *= kmask; + b *= kmask; + + // Scale to [0-maxValue] + ytmp = ((0.299f * r) + (0.587f * g) + (0.114f * b)) * maxValue; + cbtmp = halfValue - (((0.168736f * r) - (0.331264f * g) + (0.5f * b)) * maxValue); + crtmp = halfValue + (((0.5f * r) - (0.418688f * g) - (0.081312f * b)) * maxValue); + ktmp *= maxValue; + } + + y[i] = ytmp; + cb[i] = cbtmp; + cr[i] = crtmp; + k[i] = ktmp; + } + } + + public static void ConvertToRgbInPlaceWithIcc(Configuration configuration, IccProfile profile, in ComponentValues values, float maxValue) + { + using IMemoryOwner memoryOwner = configuration.MemoryAllocator.Allocate(values.Component0.Length * 4); + Span packed = memoryOwner.Memory.Span; + + Span c0 = values.Component0; + Span c1 = values.Component1; + Span c2 = values.Component2; + Span c3 = values.Component3; + + PackedNormalizeInterleave4(c0, c1, c2, c3, packed, maxValue); + + ColorProfileConverter converter = new(); + Span source = MemoryMarshal.Cast(packed); + + // YccK is not a defined ICC color space — it's a JPEG-specific encoding used in Adobe-style CMYK JPEGs. + // ICC profiles expect colorimetric CMYK values, so we must first convert YccK to CMYK using a hardcoded inverse transform. 
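+            // (Roughly: the Rec. 601 YCbCr transform is inverted to recover R'G'B', then C = 1 - R', M = 1 - G',
+            // Y = 1 - B'; i.e. the same relationships captured by the RCrMult/GCbMult/GCrMult/BCbMult constants above.)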
+ // This transform assumes Rec.601 YCbCr coefficients and an inverted K channel. + // + // The YccK => Cmyk conversion is independent of any embedded ICC profile. + // Since the same RGB working space is used during conversion to and from XYZ, + // colorimetric accuracy is preserved. + converter.Convert(MemoryMarshal.Cast(source), source); + + Span destination = MemoryMarshal.Cast(packed)[..source.Length]; + + ColorConversionOptions options = new() + { + SourceIccProfile = profile, + TargetIccProfile = CompactSrgbV4Profile.Profile, + }; + converter = new(options); + converter.Convert(source, destination); + + UnpackDeinterleave3(MemoryMarshal.Cast(packed)[..source.Length], c0, c1, c2); + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector128.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector128.cs new file mode 100644 index 000000000..b360c373a --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector128.cs @@ -0,0 +1,131 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using SixLabors.ImageSharp.Common.Helpers; +using SixLabors.ImageSharp.Metadata.Profiles.Icc; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal abstract partial class JpegColorConverterBase +{ + internal sealed class TiffYccKVector128 : JpegColorConverterVector128 + { + public TiffYccKVector128(int precision) + : base(JpegColorSpace.TiffYccK, precision) + { + } + + /// + public override void ConvertToRgbInPlace(in ComponentValues values) + { + ref Vector128 c0Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector128 c1Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector128 c2Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector128 c3Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + Vector128 scale = Vector128.Create(1F / this.MaximumValue); + Vector128 chromaOffset = Vector128.Create(this.HalfValue) * scale; + Vector128 rCrMult = Vector128.Create(YCbCrScalar.RCrMult); + Vector128 gCbMult = Vector128.Create(-YCbCrScalar.GCbMult); + Vector128 gCrMult = Vector128.Create(-YCbCrScalar.GCrMult); + Vector128 bCbMult = Vector128.Create(YCbCrScalar.BCbMult); + + nuint n = values.Component0.Vector128Count(); + for (nuint i = 0; i < n; i++) + { + ref Vector128 c0 = ref Unsafe.Add(ref c0Base, i); + ref Vector128 c1 = ref Unsafe.Add(ref c1Base, i); + ref Vector128 c2 = ref Unsafe.Add(ref c2Base, i); + ref Vector128 c3 = ref Unsafe.Add(ref c3Base, i); + + Vector128 y = c0 * scale; + Vector128 cb = (c1 * scale) - chromaOffset; + Vector128 cr = (c2 * scale) - chromaOffset; + Vector128 scaledK = Vector128.One - (c3 * scale); + + // r = y + (1.402F * cr); + // g = y - (0.344136F * cb) - (0.714136F * cr); + // b = y + (1.772F * cb); + Vector128 r = Vector128_.MultiplyAdd(y, cr, rCrMult) * scaledK; + Vector128 g = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(y, cb, gCbMult), cr, gCrMult) * scaledK; + Vector128 b = Vector128_.MultiplyAdd(y, cb, bCbMult) * scaledK; + + c0 = r; + c1 = g; + c2 = b; + } + } + + /// + public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile) + => 
TiffYccKScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue); + + /// + public override void ConvertFromRgb(in ComponentValues values, Span rLane, Span gLane, Span bLane) + { + ref Vector128 srcR = + ref Unsafe.As>(ref MemoryMarshal.GetReference(rLane)); + ref Vector128 srcG = + ref Unsafe.As>(ref MemoryMarshal.GetReference(gLane)); + ref Vector128 srcB = + ref Unsafe.As>(ref MemoryMarshal.GetReference(bLane)); + + ref Vector128 destY = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector128 destCb = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector128 destCr = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector128 destK = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + Vector128 maxSourceValue = Vector128.Create(1 / 255F); + Vector128 maxSampleValue = Vector128.Create(this.MaximumValue); + Vector128 chromaOffset = Vector128.Create(this.HalfValue); + + Vector128 f0299 = Vector128.Create(0.299f); + Vector128 f0587 = Vector128.Create(0.587f); + Vector128 f0114 = Vector128.Create(0.114f); + Vector128 fn0168736 = Vector128.Create(-0.168736f); + Vector128 fn0331264 = Vector128.Create(-0.331264f); + Vector128 fn0418688 = Vector128.Create(-0.418688f); + Vector128 fn0081312F = Vector128.Create(-0.081312F); + Vector128 f05 = Vector128.Create(0.5f); + + nuint n = values.Component0.Vector128Count(); + for (nuint i = 0; i < n; i++) + { + Vector128 r = Unsafe.Add(ref srcR, i) * maxSourceValue; + Vector128 g = Unsafe.Add(ref srcG, i) * maxSourceValue; + Vector128 b = Unsafe.Add(ref srcB, i) * maxSourceValue; + Vector128 ktmp = Vector128.One - Vector128.Max(r, Vector128.Min(g, b)); + + Vector128 kMask = ~Vector128.Equals(ktmp, Vector128.One); + Vector128 divisor = Vector128.One / (Vector128.One - ktmp); + + r = (r * divisor) & kMask; + g = (g * divisor) & kMask; + b = (b * divisor) & kMask; + + // y = 0 + (0.299 * r) + (0.587 * g) + (0.114 * b) + // cb = 128 - (0.168736 * r) - (0.331264 * g) + (0.5 * b) + // cr = 128 + (0.5 * r) - (0.418688 * g) - (0.081312 * b) + Vector128 y = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(f0114 * b, f0587, g), f0299, r); + Vector128 cb = chromaOffset + Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(f05 * b, fn0331264, g), fn0168736, r); + Vector128 cr = chromaOffset + Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(fn0081312F * b, fn0418688, g), f05, r); + + Unsafe.Add(ref destY, i) = y * maxSampleValue; + Unsafe.Add(ref destCb, i) = chromaOffset + (cb * maxSampleValue); + Unsafe.Add(ref destCr, i) = chromaOffset + (cr * maxSampleValue); + Unsafe.Add(ref destK, i) = ktmp * maxSampleValue; + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector256.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector256.cs new file mode 100644 index 000000000..f996522d3 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector256.cs @@ -0,0 +1,131 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. 
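+// 256-bit SIMD counterpart of TiffYccKScalar: ConvertToRgbInPlace and ConvertFromRgb process eight floats
+// per iteration via Vector256<float>, while the ICC path delegates to TiffYccKScalar.ConvertToRgbInPlaceWithIcc.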
+ +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using SixLabors.ImageSharp.Common.Helpers; +using SixLabors.ImageSharp.Metadata.Profiles.Icc; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal abstract partial class JpegColorConverterBase +{ + internal sealed class TiffYccKVector256 : JpegColorConverterVector256 + { + public TiffYccKVector256(int precision) + : base(JpegColorSpace.TiffYccK, precision) + { + } + + /// + public override void ConvertToRgbInPlace(in ComponentValues values) + { + ref Vector256 c0Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector256 c1Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector256 c2Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector256 c3Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + Vector256 scale = Vector256.Create(1F / this.MaximumValue); + Vector256 chromaOffset = Vector256.Create(this.HalfValue) * scale; + Vector256 rCrMult = Vector256.Create(YCbCrScalar.RCrMult); + Vector256 gCbMult = Vector256.Create(-YCbCrScalar.GCbMult); + Vector256 gCrMult = Vector256.Create(-YCbCrScalar.GCrMult); + Vector256 bCbMult = Vector256.Create(YCbCrScalar.BCbMult); + + nuint n = values.Component0.Vector256Count(); + for (nuint i = 0; i < n; i++) + { + ref Vector256 c0 = ref Unsafe.Add(ref c0Base, i); + ref Vector256 c1 = ref Unsafe.Add(ref c1Base, i); + ref Vector256 c2 = ref Unsafe.Add(ref c2Base, i); + ref Vector256 c3 = ref Unsafe.Add(ref c3Base, i); + + Vector256 y = c0 * scale; + Vector256 cb = (c1 * scale) - chromaOffset; + Vector256 cr = (c2 * scale) - chromaOffset; + Vector256 scaledK = Vector256.One - (c3 * scale); + + // r = y + (1.402F * cr); + // g = y - (0.344136F * cb) - (0.714136F * cr); + // b = y + (1.772F * cb); + Vector256 r = Vector256_.MultiplyAdd(y, cr, rCrMult) * scaledK; + Vector256 g = Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(y, cb, gCbMult), cr, gCrMult) * scaledK; + Vector256 b = Vector256_.MultiplyAdd(y, cb, bCbMult) * scaledK; + + c0 = r; + c1 = g; + c2 = b; + } + } + + /// + public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile) + => TiffYccKScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue); + + /// + public override void ConvertFromRgb(in ComponentValues values, Span rLane, Span gLane, Span bLane) + { + ref Vector256 srcR = + ref Unsafe.As>(ref MemoryMarshal.GetReference(rLane)); + ref Vector256 srcG = + ref Unsafe.As>(ref MemoryMarshal.GetReference(gLane)); + ref Vector256 srcB = + ref Unsafe.As>(ref MemoryMarshal.GetReference(bLane)); + + ref Vector256 destY = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector256 destCb = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector256 destCr = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector256 destK = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + Vector256 maxSourceValue = Vector256.Create(255F); + Vector256 maxSampleValue = Vector256.Create(this.MaximumValue); + Vector256 chromaOffset = Vector256.Create(this.HalfValue); + + Vector256 f0299 = Vector256.Create(0.299f); + Vector256 f0587 = Vector256.Create(0.587f); + Vector256 f0114 = Vector256.Create(0.114f); + Vector256 fn0168736 = Vector256.Create(-0.168736f); + Vector256 fn0331264 = 
Vector256.Create(-0.331264f); + Vector256 fn0418688 = Vector256.Create(-0.418688f); + Vector256 fn0081312F = Vector256.Create(-0.081312F); + Vector256 f05 = Vector256.Create(0.5f); + + nuint n = values.Component0.Vector256Count(); + for (nuint i = 0; i < n; i++) + { + Vector256 r = Unsafe.Add(ref srcR, i) / maxSourceValue; + Vector256 g = Unsafe.Add(ref srcG, i) / maxSourceValue; + Vector256 b = Unsafe.Add(ref srcB, i) / maxSourceValue; + Vector256 ktmp = Vector256.One - Vector256.Max(r, Vector256.Min(g, b)); + + Vector256 kMask = ~Vector256.Equals(ktmp, Vector256.One); + Vector256 divisor = Vector256.One / (Vector256.One - ktmp); + + r = (r * divisor) & kMask; + g = (g * divisor) & kMask; + b = (b * divisor) & kMask; + + // y = 0 + (0.299 * r) + (0.587 * g) + (0.114 * b) + // cb = 128 - (0.168736 * r) - (0.331264 * g) + (0.5 * b) + // cr = 128 + (0.5 * r) - (0.418688 * g) - (0.081312 * b) + Vector256 y = Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(f0114 * b, f0587, g), f0299, r); + Vector256 cb = chromaOffset + Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(f05 * b, fn0331264, g), fn0168736, r); + Vector256 cr = chromaOffset + Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(fn0081312F * b, fn0418688, g), f05, r); + + Unsafe.Add(ref destY, i) = y * maxSampleValue; + Unsafe.Add(ref destCb, i) = chromaOffset + (cb * maxSampleValue); + Unsafe.Add(ref destCr, i) = chromaOffset + (cr * maxSampleValue); + Unsafe.Add(ref destK, i) = ktmp * maxSampleValue; + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector512.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector512.cs new file mode 100644 index 000000000..47168a739 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector512.cs @@ -0,0 +1,142 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. 
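+// 512-bit SIMD counterpart of TiffYccKScalar: the vectorized paths process sixteen floats per iteration,
+// and any tail that does not fill a whole Vector512<float> is handled by the scalar remainder overrides below.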
+ +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using SixLabors.ImageSharp.Common.Helpers; +using SixLabors.ImageSharp.Metadata.Profiles.Icc; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components; + +internal abstract partial class JpegColorConverterBase +{ + internal sealed class TiffYccKVector512 : JpegColorConverterVector512 + { + public TiffYccKVector512(int precision) + : base(JpegColorSpace.TiffYccK, precision) + { + } + + /// + public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile) + => TiffYccKScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue); + + /// + protected override void ConvertToRgbInPlaceVectorized(in ComponentValues values) + { + ref Vector512 c0Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref Vector512 c1Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector512 c2Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector512 c3Base = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + Vector512 scale = Vector512.Create(1F / this.MaximumValue); + Vector512 chromaOffset = Vector512.Create(this.HalfValue) * scale; + Vector512 rCrMult = Vector512.Create(YCbCrScalar.RCrMult); + Vector512 gCbMult = Vector512.Create(-YCbCrScalar.GCbMult); + Vector512 gCrMult = Vector512.Create(-YCbCrScalar.GCrMult); + Vector512 bCbMult = Vector512.Create(YCbCrScalar.BCbMult); + + nuint n = values.Component0.Vector512Count(); + for (nuint i = 0; i < n; i++) + { + ref Vector512 c0 = ref Unsafe.Add(ref c0Base, i); + ref Vector512 c1 = ref Unsafe.Add(ref c1Base, i); + ref Vector512 c2 = ref Unsafe.Add(ref c2Base, i); + ref Vector512 c3 = ref Unsafe.Add(ref c3Base, i); + + Vector512 y = c0 * scale; + Vector512 cb = (c1 * scale) - chromaOffset; + Vector512 cr = (c2 * scale) - chromaOffset; + Vector512 scaledK = Vector512.One - (c3 * scale); + + // r = y + (1.402F * cr); + // g = y - (0.344136F * cb) - (0.714136F * cr); + // b = y + (1.772F * cb); + Vector512 r = Vector512_.MultiplyAdd(y, cr, rCrMult) * scaledK; + Vector512 g = Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(y, cb, gCbMult), cr, gCrMult) * scaledK; + Vector512 b = Vector512_.MultiplyAdd(y, cb, bCbMult) * scaledK; + + c0 = r; + c1 = g; + c2 = b; + } + } + + /// + protected override void ConvertFromRgbVectorized(in ComponentValues values, Span rLane, Span gLane, Span bLane) + => ConvertFromRgbVectorized(in values, this.MaximumValue, this.HalfValue, rLane, gLane, bLane); + + /// + protected override void ConvertToRgbInPlaceScalarRemainder(in ComponentValues values) + => TiffYccKScalar.ConvertToRgbInPlace(values, this.MaximumValue, this.HalfValue); + + /// + protected override void ConvertFromRgbScalarRemainder(in ComponentValues values, Span rLane, Span gLane, Span bLane) + => TiffYccKScalar.ConvertFromRgb(values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane); + + internal static void ConvertFromRgbVectorized(in ComponentValues values, float maxValue, float halfValue, Span rLane, Span gLane, Span bLane) + { + ref Vector512 srcR = + ref Unsafe.As>(ref MemoryMarshal.GetReference(rLane)); + ref Vector512 srcG = + ref Unsafe.As>(ref MemoryMarshal.GetReference(gLane)); + ref Vector512 srcB = + ref Unsafe.As>(ref MemoryMarshal.GetReference(bLane)); + + ref Vector512 destY = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component0)); + ref 
Vector512 destCb = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component1)); + ref Vector512 destCr = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component2)); + ref Vector512 destK = + ref Unsafe.As>(ref MemoryMarshal.GetReference(values.Component3)); + + Vector512 maxSourceValue = Vector512.Create(255F); + Vector512 maxSampleValue = Vector512.Create(maxValue); + Vector512 chromaOffset = Vector512.Create(halfValue); + + Vector512 f0299 = Vector512.Create(0.299f); + Vector512 f0587 = Vector512.Create(0.587f); + Vector512 f0114 = Vector512.Create(0.114f); + Vector512 fn0168736 = Vector512.Create(-0.168736f); + Vector512 fn0331264 = Vector512.Create(-0.331264f); + Vector512 fn0418688 = Vector512.Create(-0.418688f); + Vector512 fn0081312F = Vector512.Create(-0.081312F); + Vector512 f05 = Vector512.Create(0.5f); + + nuint n = values.Component0.Vector512Count(); + for (nuint i = 0; i < n; i++) + { + Vector512 r = Unsafe.Add(ref srcR, i) / maxSourceValue; + Vector512 g = Unsafe.Add(ref srcG, i) / maxSourceValue; + Vector512 b = Unsafe.Add(ref srcB, i) / maxSourceValue; + Vector512 ktmp = Vector512.One - Vector512.Max(r, Vector512.Min(g, b)); + + Vector512 kMask = ~Vector512.Equals(ktmp, Vector512.One); + Vector512 divisor = Vector512.One / (Vector512.One - ktmp); + + r = (r * divisor) & kMask; + g = (g * divisor) & kMask; + b = (b * divisor) & kMask; + + // y = 0 + (0.299 * r) + (0.587 * g) + (0.114 * b) + // cb = 128 - (0.168736 * r) - (0.331264 * g) + (0.5 * b) + // cr = 128 + (0.5 * r) - (0.418688 * g) - (0.081312 * b) + Vector512 y = Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(f0114 * b, f0587, g), f0299, r); + Vector512 cb = chromaOffset + Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(f05 * b, fn0331264, g), fn0168736, r); + Vector512 cr = chromaOffset + Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(fn0081312F * b, fn0418688, g), f05, r); + + Unsafe.Add(ref destY, i) = y * maxSampleValue; + Unsafe.Add(ref destCb, i) = chromaOffset + (cb * maxSampleValue); + Unsafe.Add(ref destCr, i) = chromaOffset + (cr * maxSampleValue); + Unsafe.Add(ref destK, i) = ktmp * maxSampleValue; + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKScalar.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKScalar.cs index 136a9bf90..8cd715eb3 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKScalar.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKScalar.cs @@ -14,7 +14,7 @@ internal abstract partial class JpegColorConverterBase { internal sealed class YccKScalar : JpegColorConverterScalar { - // derived from ITU-T Rec. T.871 + // Derived from ITU-T Rec. 
T.871 internal const float RCrMult = 1.402f; internal const float GCbMult = (float)(0.114 * 1.772 / 0.587); internal const float GCrMult = (float)(0.299 * 1.402 / 0.587); @@ -27,7 +27,7 @@ internal abstract partial class JpegColorConverterBase /// public override void ConvertToRgbInPlace(in ComponentValues values) - => ConvertToRgpInPlace(values, this.MaximumValue, this.HalfValue); + => ConvertToRgbInPlace(values, this.MaximumValue, this.HalfValue); /// public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile) @@ -37,7 +37,7 @@ internal abstract partial class JpegColorConverterBase public override void ConvertFromRgb(in ComponentValues values, Span rLane, Span gLane, Span bLane) => ConvertFromRgb(values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane); - public static void ConvertToRgpInPlace(in ComponentValues values, float maxValue, float halfValue) + public static void ConvertToRgbInPlace(in ComponentValues values, float maxValue, float halfValue) { Span c0 = values.Component0; Span c1 = values.Component1; diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs index 213ea34cc..b81a833cd 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs @@ -82,7 +82,7 @@ internal abstract partial class JpegColorConverterBase /// protected override void ConvertToRgbInPlaceScalarRemainder(in ComponentValues values) - => YccKScalar.ConvertToRgpInPlace(values, this.MaximumValue, this.HalfValue); + => YccKScalar.ConvertToRgbInPlace(values, this.MaximumValue, this.HalfValue); /// protected override void ConvertFromRgbVectorized(in ComponentValues values, Span rLane, Span gLane, Span bLane) @@ -138,12 +138,6 @@ internal abstract partial class JpegColorConverterBase /// protected override void ConvertFromRgbScalarRemainder(in ComponentValues values, Span rLane, Span gLane, Span bLane) - { - // rgb -> cmyk - CmykScalar.ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane); - - // cmyk -> ycck - YccKScalar.ConvertFromRgb(in values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane); - } + => YccKScalar.ConvertFromRgb(in values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane); } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverterBase.cs b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverterBase.cs index c7cc8e971..74227c7a6 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverterBase.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverterBase.cs @@ -153,6 +153,39 @@ internal abstract partial class JpegColorConverterBase } } + public static void PackedNormalizeInterleave4( + ReadOnlySpan xLane, + ReadOnlySpan yLane, + ReadOnlySpan zLane, + ReadOnlySpan wLane, + Span packed, + float maxValue) + { + DebugGuard.IsTrue(packed.Length % 4 == 0, "Packed length must be divisible by 4."); + DebugGuard.IsTrue(yLane.Length == xLane.Length, nameof(yLane), "Channels must be of same size!"); + DebugGuard.IsTrue(zLane.Length == xLane.Length, nameof(zLane), "Channels must be of same size!"); + DebugGuard.IsTrue(wLane.Length == xLane.Length, nameof(wLane), "Channels must be of same size!"); + 
DebugGuard.MustBeLessThanOrEqualTo(packed.Length / 4, xLane.Length, nameof(packed)); + + float scale = 1F / maxValue; + + // TODO: Investigate SIMD version of this. + ref float xLaneRef = ref MemoryMarshal.GetReference(xLane); + ref float yLaneRef = ref MemoryMarshal.GetReference(yLane); + ref float zLaneRef = ref MemoryMarshal.GetReference(zLane); + ref float wLaneRef = ref MemoryMarshal.GetReference(wLane); + ref float packedRef = ref MemoryMarshal.GetReference(packed); + + for (nuint i = 0; i < (nuint)xLane.Length; i++) + { + nuint baseIdx = i * 4; + Unsafe.Add(ref packedRef, baseIdx) = Unsafe.Add(ref xLaneRef, i) * scale; + Unsafe.Add(ref packedRef, baseIdx + 1) = Unsafe.Add(ref yLaneRef, i) * scale; + Unsafe.Add(ref packedRef, baseIdx + 2) = Unsafe.Add(ref zLaneRef, i) * scale; + Unsafe.Add(ref packedRef, baseIdx + 3) = Unsafe.Add(ref wLaneRef, i) * scale; + } + } + public static void PackedInvertNormalizeInterleave4( ReadOnlySpan xLane, ReadOnlySpan yLane, @@ -198,6 +231,8 @@ internal abstract partial class JpegColorConverterBase GetCmykConverter(8), GetGrayScaleConverter(8), GetRgbConverter(8), + GetTiffCmykConverter(8), + GetTiffYccKConverter(8), // 12-bit converters GetYCbCrConverter(12), @@ -205,6 +240,8 @@ internal abstract partial class JpegColorConverterBase GetCmykConverter(12), GetGrayScaleConverter(12), GetRgbConverter(12), + GetTiffCmykConverter(12), + GetTiffYccKConverter(12), ]; /// @@ -327,6 +364,46 @@ internal abstract partial class JpegColorConverterBase return new RgbScalar(precision); } + private static JpegColorConverterBase GetTiffCmykConverter(int precision) + { + if (JpegColorConverterVector512.IsSupported) + { + return new TiffCmykVector512(precision); + } + + if (JpegColorConverterVector256.IsSupported) + { + return new TiffCmykVector256(precision); + } + + if (JpegColorConverterVector128.IsSupported) + { + return new TiffCmykVector128(precision); + } + + return new TiffCmykScalar(precision); + } + + private static JpegColorConverterBase GetTiffYccKConverter(int precision) + { + if (JpegColorConverterVector512.IsSupported) + { + return new TiffYccKVector512(precision); + } + + if (JpegColorConverterVector256.IsSupported) + { + return new TiffYccKVector256(precision); + } + + if (JpegColorConverterVector128.IsSupported) + { + return new TiffYccKVector128(precision); + } + + return new TiffYccKScalar(precision); + } + /// /// A stack-only struct to reference the input buffers using -s. /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs index e65296199..0703e4d9e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs @@ -83,7 +83,8 @@ internal abstract class SpectralConverter /// The jpeg frame with the color space to convert to. /// The raw JPEG data. /// The color converter. - protected virtual JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverterBase.GetConverter(jpegData.ColorSpace, frame.Precision); + protected virtual JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData) + => JpegColorConverterBase.GetConverter(jpegData.ColorSpace, frame.Precision); /// /// Calculates image size with optional scaling. 
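A minimal standalone sketch (illustrative names, plain spans) of what the new PackedNormalizeInterleave4 helper above does: each of the four component lanes is scaled by 1 / maxValue and written into the packed buffer as interleaved x, y, z, w quadruplets, one quadruplet per pixel, ready to be reinterpreted as four-component color values for the ICC converter.

// Illustrative sketch only; mirrors the loop above without the DebugGuard checks and Unsafe plumbing.
static void PackNormalizeInterleave4Sketch(
    ReadOnlySpan<float> x, ReadOnlySpan<float> y, ReadOnlySpan<float> z, ReadOnlySpan<float> w,
    Span<float> packed, float maxValue)
{
    float scale = 1F / maxValue;
    for (int i = 0; i < x.Length; i++)
    {
        packed[(i * 4) + 0] = x[i] * scale;
        packed[(i * 4) + 1] = y[i] * scale;
        packed[(i * 4) + 2] = z[i] * scale;
        packed[(i * 4) + 3] = w[i] * scale;
    }
}

// Example: with maxValue = 255 and single-sample lanes { 255 }, { 0 }, { 127.5 }, { 255 },
// packed becomes { 1F, 0F, 0.5F, 1F }.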
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/SpectralConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/SpectralConverter{TPixel}.cs index fc93db9bb..baaa7213a 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/SpectralConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/SpectralConverter{TPixel}.cs @@ -2,7 +2,6 @@ // Licensed under the Six Labors Split License. using System.Buffers; -using SixLabors.ImageSharp.Advanced; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -109,6 +108,8 @@ internal class SpectralConverter : SpectralConverter, IDisposable int y = yy - this.pixelRowCounter; // Unpack TPixel to r/g/b planes + // TODO: The individual implementation code would be much easier here if + // we scaled to [0-1] before passing to the individual converters. int srcIndex = Math.Min(yy, pixelBufferLastVerticalIndex); Span sourceRow = this.pixelBuffer.DangerousGetRowSpan(srcIndex); PixelOperations.Instance.UnpackIntoRgbPlanes(rLane, gLane, bLane, sourceRow); diff --git a/src/ImageSharp/Formats/Jpeg/Components/JpegColorSpace.cs b/src/ImageSharp/Formats/Jpeg/Components/JpegColorSpace.cs index a2ec0666b..c7b9745fd 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/JpegColorSpace.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/JpegColorSpace.cs @@ -23,6 +23,16 @@ internal enum JpegColorSpace /// Cmyk, + /// + /// YccK color space with 4 components, used with tiff images, which use jpeg compression. + /// + TiffYccK, + + /// + /// Cmyk color space with 4 components, used with tiff images, which use jpeg compression. + /// + TiffCmyk, + /// /// Color space with 3 components. /// diff --git a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs index 9198a5239..0b2d3d67e 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs @@ -115,12 +115,14 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData /// Initializes a new instance of the class. /// /// The decoder options. - public JpegDecoderCore(JpegDecoderOptions options) + /// The ICC profile to use for color conversion. + public JpegDecoderCore(JpegDecoderOptions options, IccProfile iccProfile = null) : base(options.GeneralOptions) { this.resizeMode = options.ResizeMode; this.configuration = options.GeneralOptions.Configuration; this.skipMetadata = options.GeneralOptions.SkipMetadata; + this.SetIccMetadata(iccProfile); } /// @@ -231,7 +233,7 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData /// The scan decoder. 
public void LoadTables(byte[] tableBytes, IJpegScanDecoder scanDecoder) { - this.Metadata = new ImageMetadata(); + this.Metadata ??= new ImageMetadata(); this.QuantizationTables = new Block8x8F[4]; this.scanDecoder = scanDecoder; if (tableBytes.Length < 4) @@ -314,7 +316,7 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData this.scanDecoder ??= new HuffmanScanDecoder(stream, spectralConverter, cancellationToken); - this.Metadata = new ImageMetadata(); + this.Metadata ??= new ImageMetadata(); Span markerBuffer = stackalloc byte[2]; @@ -678,6 +680,16 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData } } + private void SetIccMetadata(IccProfile profile) + { + if (!this.skipMetadata && profile?.CheckIsValid() == true) + { + this.hasIcc = true; + this.Metadata ??= new ImageMetadata(); + this.Metadata.IccProfile = profile; + } + } + /// /// Initializes the IPTC profile. /// diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs index 1df55b8b5..8bdbea616 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs @@ -6,6 +6,7 @@ using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder; using SixLabors.ImageSharp.Formats.Tiff.Constants; using SixLabors.ImageSharp.IO; using SixLabors.ImageSharp.Memory; +using SixLabors.ImageSharp.Metadata; using SixLabors.ImageSharp.Metadata.Profiles.Icc; using SixLabors.ImageSharp.PixelFormats; @@ -22,6 +23,8 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor private readonly TiffPhotometricInterpretation photometricInterpretation; + private readonly ImageFrameMetadata metadata; + /// /// Initializes a new instance of the class. /// @@ -29,6 +32,7 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor /// The memoryAllocator to use for buffer allocations. /// The image width. /// The bits per pixel. + /// The image frame metadata. /// The JPEG tables containing the quantization and/or Huffman tables. /// The photometric interpretation. 
public JpegTiffCompression( @@ -36,11 +40,13 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor MemoryAllocator memoryAllocator, int width, int bitsPerPixel, + ImageFrameMetadata metadata, byte[] jpegTables, TiffPhotometricInterpretation photometricInterpretation) : base(memoryAllocator, width, bitsPerPixel) { this.options = options; + this.metadata = metadata; this.jpegTables = jpegTables; this.photometricInterpretation = photometricInterpretation; } @@ -61,7 +67,7 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor private void DecodeJpegData(BufferedReadStream stream, Span buffer, CancellationToken cancellationToken) { - using JpegDecoderCore jpegDecoder = new(this.options); + using JpegDecoderCore jpegDecoder = new(this.options, this.metadata.IccProfile); Configuration configuration = this.options.GeneralOptions.Configuration; switch (this.photometricInterpretation) { @@ -85,6 +91,7 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor case TiffPhotometricInterpretation.YCbCr: case TiffPhotometricInterpretation.Rgb: + case TiffPhotometricInterpretation.Separated: { using SpectralConverter spectralConverter = new TiffJpegSpectralConverter(configuration, this.photometricInterpretation); HuffmanScanDecoder scanDecoder = new(stream, spectralConverter, cancellationToken); diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/OldJpegTiffCompression.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/OldJpegTiffCompression.cs index 13257dd63..c07b93af8 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/OldJpegTiffCompression.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/OldJpegTiffCompression.cs @@ -6,6 +6,7 @@ using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder; using SixLabors.ImageSharp.Formats.Tiff.Constants; using SixLabors.ImageSharp.IO; using SixLabors.ImageSharp.Memory; +using SixLabors.ImageSharp.Metadata; using SixLabors.ImageSharp.Metadata.Profiles.Icc; using SixLabors.ImageSharp.PixelFormats; @@ -17,6 +18,8 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor private readonly uint startOfImageMarker; + private readonly ImageFrameMetadata metadata; + private readonly TiffPhotometricInterpretation photometricInterpretation; public OldJpegTiffCompression( @@ -24,12 +27,14 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor MemoryAllocator memoryAllocator, int width, int bitsPerPixel, + ImageFrameMetadata metadata, uint startOfImageMarker, TiffPhotometricInterpretation photometricInterpretation) : base(memoryAllocator, width, bitsPerPixel) { this.options = options; this.startOfImageMarker = startOfImageMarker; + this.metadata = metadata; this.photometricInterpretation = photometricInterpretation; } @@ -47,7 +52,7 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor private void DecodeJpegData(BufferedReadStream stream, Span buffer, CancellationToken cancellationToken) { - using JpegDecoderCore jpegDecoder = new(this.options); + using JpegDecoderCore jpegDecoder = new(this.options, this.metadata.IccProfile); Configuration configuration = this.options.GeneralOptions.Configuration; switch (this.photometricInterpretation) { @@ -71,6 +76,7 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor case TiffPhotometricInterpretation.YCbCr: case TiffPhotometricInterpretation.Rgb: + case TiffPhotometricInterpretation.Separated: { using SpectralConverter spectralConverter = new TiffOldJpegSpectralConverter(configuration, 
this.photometricInterpretation); diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffJpegSpectralConverter{TPixel}.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffJpegSpectralConverter{TPixel}.cs index f051aaea1..c32d1ea6b 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffJpegSpectralConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffJpegSpectralConverter{TPixel}.cs @@ -31,19 +31,30 @@ internal sealed class TiffJpegSpectralConverter : SpectralConverter protected override JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData) { - JpegColorSpace colorSpace = GetJpegColorSpaceFromPhotometricInterpretation(this.photometricInterpretation); + JpegColorSpace colorSpace = GetJpegColorSpace(this.photometricInterpretation, jpegData); return JpegColorConverterBase.GetConverter(colorSpace, frame.Precision); } /// - /// This converter must be used only for RGB and YCbCr color spaces for performance reasons. + /// Photometric interpretation Rgb and YCbCr will be mapped to RGB colorspace, which means the jpeg decompression will leave the data as is (no color conversion). + /// The color conversion will be done after the decompression. For Separated/CMYK/YCCK, the jpeg color converter will handle the color conversion, + /// since the jpeg color converter needs to return RGB data and cannot return 4 component data. /// For grayscale images must be used. /// - private static JpegColorSpace GetJpegColorSpaceFromPhotometricInterpretation(TiffPhotometricInterpretation interpretation) - => interpretation switch - { - TiffPhotometricInterpretation.Rgb => JpegColorSpace.RGB, - TiffPhotometricInterpretation.YCbCr => JpegColorSpace.RGB, - _ => throw new InvalidImageContentException($"Invalid tiff photometric interpretation for jpeg encoding: {interpretation}"), - }; + /// + /// The to convert to a . + /// + /// + /// The containing the color space information. + /// + /// + /// Thrown when the is not supported for JPEG encoding. + /// + private static JpegColorSpace GetJpegColorSpace(TiffPhotometricInterpretation interpretation, IRawJpegData data) => interpretation switch + { + TiffPhotometricInterpretation.Rgb => JpegColorSpace.RGB, + TiffPhotometricInterpretation.Separated => data.ColorSpace == JpegColorSpace.Ycck ? JpegColorSpace.TiffYccK : JpegColorSpace.TiffCmyk, + TiffPhotometricInterpretation.YCbCr => JpegColorSpace.RGB, // TODO: Why doesn't this use the YCbCr color space? 
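+            // (Presumably because, as the summary above notes, mapping to RGB leaves the samples untouched during
+            // decompression and the color conversion happens afterwards.)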
+ _ => throw new InvalidImageContentException($"Invalid TIFF photometric interpretation for JPEG encoding: {interpretation}"), + }; } diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffOldJpegSpectralConverter{TPixel}.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffOldJpegSpectralConverter{TPixel}.cs index 457c8d79c..1e97527bc 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffOldJpegSpectralConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffOldJpegSpectralConverter{TPixel}.cs @@ -30,15 +30,16 @@ internal sealed class TiffOldJpegSpectralConverter : SpectralConverter protected override JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData) { - JpegColorSpace colorSpace = GetJpegColorSpaceFromPhotometricInterpretation(this.photometricInterpretation); + JpegColorSpace colorSpace = GetJpegColorSpaceFromPhotometricInterpretation(this.photometricInterpretation, jpegData); return JpegColorConverterBase.GetConverter(colorSpace, frame.Precision); } - private static JpegColorSpace GetJpegColorSpaceFromPhotometricInterpretation(TiffPhotometricInterpretation interpretation) + private static JpegColorSpace GetJpegColorSpaceFromPhotometricInterpretation(TiffPhotometricInterpretation interpretation, IRawJpegData data) => interpretation switch { // Like libtiff: Always treat the pixel data as YCbCr when the data is compressed with old jpeg compression. TiffPhotometricInterpretation.Rgb => JpegColorSpace.YCbCr, + TiffPhotometricInterpretation.Separated => data.ColorSpace == JpegColorSpace.Ycck ? JpegColorSpace.TiffYccK : JpegColorSpace.TiffCmyk, TiffPhotometricInterpretation.YCbCr => JpegColorSpace.YCbCr, _ => throw new InvalidImageContentException($"Invalid tiff photometric interpretation for jpeg encoding: {interpretation}"), }; diff --git a/src/ImageSharp/Formats/Tiff/Compression/TiffDecompressorsFactory.cs b/src/ImageSharp/Formats/Tiff/Compression/TiffDecompressorsFactory.cs index 3e1df261b..0bc2e7343 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/TiffDecompressorsFactory.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/TiffDecompressorsFactory.cs @@ -5,6 +5,7 @@ using SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors; using SixLabors.ImageSharp.Formats.Tiff.Constants; using SixLabors.ImageSharp.Formats.Tiff.PhotometricInterpretation; using SixLabors.ImageSharp.Memory; +using SixLabors.ImageSharp.Metadata; namespace SixLabors.ImageSharp.Formats.Tiff.Compression; @@ -17,6 +18,7 @@ internal static class TiffDecompressorsFactory TiffPhotometricInterpretation photometricInterpretation, int width, int bitsPerPixel, + ImageFrameMetadata metadata, TiffColorType colorType, TiffPredictor predictor, FaxCompressionOptions faxOptions, @@ -62,11 +64,11 @@ internal static class TiffDecompressorsFactory case TiffDecoderCompressionType.Jpeg: DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression"); - return new JpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, jpegTables, photometricInterpretation); + return new JpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, metadata, jpegTables, photometricInterpretation); case TiffDecoderCompressionType.OldJpeg: DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression"); - return new OldJpegTiffCompression(new() { GeneralOptions = options }, allocator, width, 
bitsPerPixel, oldJpegStartOfImageMarker, photometricInterpretation); + return new OldJpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, metadata, oldJpegStartOfImageMarker, photometricInterpretation); case TiffDecoderCompressionType.Webp: DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression"); diff --git a/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/CmykTiffColor{TPixel}.cs b/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/CmykTiffColor{TPixel}.cs index c7fe2ed07..b0580ead3 100644 --- a/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/CmykTiffColor{TPixel}.cs +++ b/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/CmykTiffColor{TPixel}.cs @@ -1,7 +1,9 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. +using System.Numerics; using SixLabors.ImageSharp.ColorProfiles; +using SixLabors.ImageSharp.Formats.Tiff.Compression; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -13,10 +15,31 @@ internal class CmykTiffColor : TiffBaseColorDecoder private static readonly ColorProfileConverter ColorProfileConverter = new(); private const float Inv255 = 1f / 255f; + private readonly TiffDecoderCompressionType compression; + + public CmykTiffColor(TiffDecoderCompressionType compression) => this.compression = compression; + /// public override void Decode(ReadOnlySpan data, Buffer2D pixels, int left, int top, int width, int height) { int offset = 0; + + if (this.compression == TiffDecoderCompressionType.Jpeg) + { + for (int y = top; y < top + height; y++) + { + Span pixelRow = pixels.DangerousGetRowSpan(y).Slice(left, width); + for (int x = 0; x < pixelRow.Length; x++) + { + pixelRow[x] = TPixel.FromVector4(new Vector4(data[offset] * Inv255, data[offset + 1] * Inv255, data[offset + 2] * Inv255, 1.0f)); + + offset += 3; + } + } + + return; + } + for (int y = top; y < top + height; y++) { Span pixelRow = pixels.DangerousGetRowSpan(y).Slice(left, width); diff --git a/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/TiffColorDecoderFactory{TPixel}.cs b/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/TiffColorDecoderFactory{TPixel}.cs index c59b08a55..e2eb82e3b 100644 --- a/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/TiffColorDecoderFactory{TPixel}.cs +++ b/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/TiffColorDecoderFactory{TPixel}.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. 
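Illustrative aside (not part of the patch): the pieces above fit together as follows. GetJpegColorSpace maps Separated to TiffCmyk/TiffYccK so the JPEG color converter performs the CMYK/YCCK-to-RGB conversion during decompression, and CmykTiffColor therefore only copies 8-bit RGB triplets when the compression is JPEG. A minimal sketch of that copy step, assuming interleaved 8-bit RGB output from the converter (the helper name is hypothetical):

using System.Numerics;
using SixLabors.ImageSharp.PixelFormats;

internal static class RgbRowCopySketch
{
    private const float Inv255 = 1f / 255f;

    // Copies one row of 3-byte RGB samples into the pixel buffer. The K channel has already
    // been folded into RGB by the JPEG color converter, so alpha is forced to 1.
    public static void CopyRgbRow<TPixel>(ReadOnlySpan<byte> data, Span<TPixel> pixelRow)
        where TPixel : unmanaged, IPixel<TPixel>
    {
        int offset = 0;
        for (int x = 0; x < pixelRow.Length; x++)
        {
            pixelRow[x] = TPixel.FromVector4(
                new Vector4(data[offset] * Inv255, data[offset + 1] * Inv255, data[offset + 2] * Inv255, 1f));
            offset += 3;
        }
    }
}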
+using SixLabors.ImageSharp.Formats.Tiff.Compression; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -19,6 +20,7 @@ internal static class TiffColorDecoderFactory Rational[] referenceBlackAndWhite, Rational[] ycbcrCoefficients, ushort[] ycbcrSubSampling, + TiffDecoderCompressionType compression, ByteOrder byteOrder) { switch (colorType) @@ -410,7 +412,7 @@ internal static class TiffColorDecoderFactory && bitsPerSample.Channel1 == 8 && bitsPerSample.Channel0 == 8, "bitsPerSample"); - return new CmykTiffColor(); + return new CmykTiffColor(compression); default: throw TiffThrowHelper.InvalidColorType(colorType.ToString()); diff --git a/src/ImageSharp/Formats/Tiff/TiffDecoderCore.cs b/src/ImageSharp/Formats/Tiff/TiffDecoderCore.cs index d8ebb1e9e..e594ee812 100644 --- a/src/ImageSharp/Formats/Tiff/TiffDecoderCore.cs +++ b/src/ImageSharp/Formats/Tiff/TiffDecoderCore.cs @@ -11,6 +11,7 @@ using SixLabors.ImageSharp.IO; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.Metadata; using SixLabors.ImageSharp.Metadata.Profiles.Exif; +using SixLabors.ImageSharp.Metadata.Profiles.Icc; using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp.Formats.Tiff; @@ -280,6 +281,12 @@ internal class TiffDecoderCore : ImageDecoderCore if (!this.skipMetadata) { imageFrameMetaData.ExifProfile = tags; + + // We resolve the ICC profile early so that we can use it for color conversion if needed. + if (tags.TryGetValue(ExifTag.IccProfile, out IExifValue iccProfileBytes)) + { + imageFrameMetaData.IccProfile = new IccProfile(iccProfileBytes.Value); + } } TiffFrameMetadata tiffMetadata = TiffFrameMetadata.Parse(tags); @@ -438,7 +445,7 @@ internal class TiffDecoderCore : ImageDecoderCore stripBuffers[stripIndex] = this.memoryAllocator.Allocate(uncompressedStripSize); } - using TiffBaseDecompressor decompressor = this.CreateDecompressor(width, bitsPerPixel); + using TiffBaseDecompressor decompressor = this.CreateDecompressor(width, bitsPerPixel, frame.Metadata); TiffBasePlanarColorDecoder colorDecoder = this.CreatePlanarColorDecoder(); for (int i = 0; i < stripsPerPlane; i++) @@ -507,7 +514,7 @@ internal class TiffDecoderCore : ImageDecoderCore Span stripBufferSpan = stripBuffer.GetSpan(); Buffer2D pixels = frame.PixelBuffer; - using TiffBaseDecompressor decompressor = this.CreateDecompressor(width, bitsPerPixel); + using TiffBaseDecompressor decompressor = this.CreateDecompressor(width, bitsPerPixel, frame.Metadata); TiffBaseColorDecoder colorDecoder = this.CreateChunkyColorDecoder(); for (int stripIndex = 0; stripIndex < stripOffsets.Length; stripIndex++) @@ -578,7 +585,7 @@ internal class TiffDecoderCore : ImageDecoderCore tilesBuffers[i] = this.memoryAllocator.Allocate(uncompressedTilesSize, AllocationOptions.Clean); } - using TiffBaseDecompressor decompressor = this.CreateDecompressor(frame.Width, bitsPerPixel); + using TiffBaseDecompressor decompressor = this.CreateDecompressor(frame.Width, bitsPerPixel, frame.Metadata); TiffBasePlanarColorDecoder colorDecoder = this.CreatePlanarColorDecoder(); int tileIndex = 0; @@ -679,7 +686,7 @@ internal class TiffDecoderCore : ImageDecoderCore using IMemoryOwner tileBuffer = this.memoryAllocator.Allocate(bytesPerTileRow * tileLength, AllocationOptions.Clean); Span tileBufferSpan = tileBuffer.GetSpan(); - using TiffBaseDecompressor decompressor = this.CreateDecompressor(frame.Width, bitsPerPixel, true, tileWidth, tileLength); + using TiffBaseDecompressor decompressor = this.CreateDecompressor(frame.Width, bitsPerPixel, 
frame.Metadata, true, tileWidth, tileLength); TiffBaseColorDecoder colorDecoder = this.CreateChunkyColorDecoder(); int tileIndex = 0; @@ -733,6 +740,7 @@ internal class TiffDecoderCore : ImageDecoderCore this.ReferenceBlackAndWhite, this.YcbcrCoefficients, this.YcbcrSubSampling, + this.CompressionType, this.byteOrder); private TiffBasePlanarColorDecoder CreatePlanarColorDecoder() @@ -747,7 +755,13 @@ internal class TiffDecoderCore : ImageDecoderCore this.YcbcrSubSampling, this.byteOrder); - private TiffBaseDecompressor CreateDecompressor(int frameWidth, int bitsPerPixel, bool isTiled = false, int tileWidth = 0, int tileHeight = 0) + private TiffBaseDecompressor CreateDecompressor( + int frameWidth, + int bitsPerPixel, + ImageFrameMetadata metadata, + bool isTiled = false, + int tileWidth = 0, + int tileHeight = 0) where TPixel : unmanaged, IPixel => TiffDecompressorsFactory.Create( this.Options, @@ -756,6 +770,7 @@ internal class TiffDecoderCore : ImageDecoderCore this.PhotometricInterpretation, frameWidth, bitsPerPixel, + metadata, this.ColorType, this.Predictor, this.FaxCompressionOptions, diff --git a/src/ImageSharp/Formats/Tiff/TiffDecoderMetadataCreator.cs b/src/ImageSharp/Formats/Tiff/TiffDecoderMetadataCreator.cs index 28565cac4..ebf407f9b 100644 --- a/src/ImageSharp/Formats/Tiff/TiffDecoderMetadataCreator.cs +++ b/src/ImageSharp/Formats/Tiff/TiffDecoderMetadataCreator.cs @@ -5,7 +5,6 @@ using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Metadata; using SixLabors.ImageSharp.Metadata.Profiles.Exif; -using SixLabors.ImageSharp.Metadata.Profiles.Icc; using SixLabors.ImageSharp.Metadata.Profiles.Iptc; using SixLabors.ImageSharp.Metadata.Profiles.Xmp; @@ -29,6 +28,8 @@ internal static class TiffDecoderMetadataCreator { for (int i = 0; i < frames.Count; i++) { + // ICC profile data has already been resolved in the frame metadata, + // as it is required for color conversion. ImageFrameMetadata frameMetaData = frames[i]; if (TryGetIptc(frameMetaData.ExifProfile.Values, out byte[] iptcBytes)) { @@ -39,11 +40,6 @@ internal static class TiffDecoderMetadataCreator { frameMetaData.XmpProfile = new XmpProfile(xmpProfileBytes.Value); } - - if (frameMetaData.ExifProfile.TryGetValue(ExifTag.IccProfile, out IExifValue iccProfileBytes)) - { - frameMetaData.IccProfile = new IccProfile(iccProfileBytes.Value); - } } } diff --git a/src/ImageSharp/Formats/Webp/AlphaDecoder.cs b/src/ImageSharp/Formats/Webp/AlphaDecoder.cs index 43dab1ffc..c7ce12fc7 100644 --- a/src/ImageSharp/Formats/Webp/AlphaDecoder.cs +++ b/src/ImageSharp/Formats/Webp/AlphaDecoder.cs @@ -6,7 +6,6 @@ using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Formats.Webp.BitReader; @@ -314,7 +313,7 @@ internal class AlphaDecoder : IDisposable private static void HorizontalUnfilter(Span prev, Span input, Span dst, int width) { - if ((Sse2.IsSupported || AdvSimd.IsSupported) && width >= 9) + if (Vector128.IsHardwareAccelerated && width >= 9) { dst[0] = (byte)(input[0] + (prev.IsEmpty ? 
0 : prev[0])); nuint i; @@ -362,7 +361,7 @@ internal class AlphaDecoder : IDisposable { HorizontalUnfilter(null, input, dst, width); } - else if (Avx2.IsSupported) + else if (Vector256.IsHardwareAccelerated) { ref byte inputRef = ref MemoryMarshal.GetReference(input); ref byte prevRef = ref MemoryMarshal.GetReference(prev); @@ -374,7 +373,7 @@ internal class AlphaDecoder : IDisposable { Vector256 a0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, i)); Vector256 b0 = Unsafe.As>(ref Unsafe.Add(ref prevRef, i)); - Vector256 c0 = Avx2.Add(a0.AsByte(), b0.AsByte()); + Vector256 c0 = a0.AsByte() + b0.AsByte(); ref byte outputRef = ref Unsafe.Add(ref dstRef, i); Unsafe.As>(ref outputRef) = c0; } diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs index 9a6dfb66e..c701d56d3 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs @@ -4,7 +4,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Webp.Lossless; @@ -12,17 +12,20 @@ internal static class ColorSpaceTransformUtils { public static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) { - if (Avx2.IsSupported && tileWidth >= 16) + if (Vector256.IsHardwareAccelerated && tileWidth >= 16) { const int span = 16; Span values = stackalloc ushort[span]; - var collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255); - var collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30); - var collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - var collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - var collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); - var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); - var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); + + // These shuffle masks are safe for use with Avx2.Shuffle because all indices are within their respective 128-bit lanes (0–15 for the low mask, 16–31 for the high mask), + // and all disabled lanes are set to 0xFF to zero those bytes per the vpshufb specification. This guarantees lane-local shuffling with no cross-lane violations. 
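Illustrative aside (not part of the patch): the lane rule described in the comment above can be sanity-checked with a small hypothetical helper. Note that vpshufb itself only consumes the low four bits of each index plus bit 7, so this encodes the stricter convention the comment documents rather than the full instruction semantics.

internal static class ShuffleMaskGuard
{
    // Returns true when every non-zeroing byte of a 32-byte shuffle mask indexes
    // inside its own 128-bit lane (0-15 for the low lane, 16-31 for the high lane).
    public static bool IsLaneLocal(ReadOnlySpan<byte> mask)
    {
        for (int i = 0; i < 32; i++)
        {
            byte b = mask[i];
            if ((b & 0x80) != 0)
            {
                continue; // Bit 7 set (e.g. 0xFF): the element is zeroed, the index is irrelevant.
            }

            if (b / 16 != i / 16)
            {
                return false; // The index points into the other 128-bit lane.
            }
        }

        return true;
    }
}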
+ Vector256 collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255); + Vector256 collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30); + Vector256 collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + Vector256 collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + Vector256 collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + Vector256 multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); + Vector256 multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); for (int y = 0; y < tileHeight; y++) { Span srcSpan = bgra[(y * stride)..]; @@ -33,18 +36,18 @@ internal static class ColorSpaceTransformUtils nuint input1Idx = x + (span / 2); Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector256 r0 = Avx2.Shuffle(input0, collectColorBlueTransformsShuffleLowMask256); - Vector256 r1 = Avx2.Shuffle(input1, collectColorBlueTransformsShuffleHighMask256); - Vector256 r = Avx2.Or(r0, r1); - Vector256 gb0 = Avx2.And(input0, collectColorBlueTransformsGreenBlueMask256); - Vector256 gb1 = Avx2.And(input1, collectColorBlueTransformsGreenBlueMask256); - Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); - Vector256 g = Avx2.And(gb.AsByte(), collectColorBlueTransformsGreenMask256); - Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr); - Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); - Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte()); - Vector256 d = Avx2.Subtract(c, a.AsByte()); - Vector256 e = Avx2.And(d, collectColorBlueTransformsBlueMask256); + Vector256 r0 = Vector256_.ShufflePerLane(input0, collectColorBlueTransformsShuffleLowMask256); + Vector256 r1 = Vector256_.ShufflePerLane(input1, collectColorBlueTransformsShuffleHighMask256); + Vector256 r = r0 | r1; + Vector256 gb0 = input0 & collectColorBlueTransformsGreenBlueMask256; + Vector256 gb1 = input1 & collectColorBlueTransformsGreenBlueMask256; + Vector256 gb = Vector256_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector256 g = gb.AsByte() & collectColorBlueTransformsGreenMask256; + Vector256 a = Vector256_.MultiplyHigh(r.AsInt16(), multsr); + Vector256 b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); + Vector256 c = gb.AsByte() - b.AsByte(); + Vector256 d = c - a.AsByte(); + Vector256 e = d & collectColorBlueTransformsBlueMask256; ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = e.AsUInt16(); @@ -59,20 +62,20 @@ internal static class ColorSpaceTransformUtils int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, 
tileHeight, greenToBlue, redToBlue, histo); } } - else if (Sse41.IsSupported) + else if (Vector128.IsHardwareAccelerated) { const int span = 8; Span values = stackalloc ushort[span]; - var collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); - var collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); - var collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - var collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - var collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); - var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue)); - var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue)); + Vector128 collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); + Vector128 collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); + Vector128 collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + Vector128 collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + Vector128 collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + Vector128 multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue)); + Vector128 multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue)); for (int y = 0; y < tileHeight; y++) { Span srcSpan = bgra[(y * stride)..]; @@ -83,18 +86,18 @@ internal static class ColorSpaceTransformUtils nuint input1Idx = x + (span / 2); Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector128 r0 = Ssse3.Shuffle(input0, collectColorBlueTransformsShuffleLowMask); - Vector128 r1 = Ssse3.Shuffle(input1, collectColorBlueTransformsShuffleHighMask); - Vector128 r = Sse2.Or(r0, r1); - Vector128 gb0 = Sse2.And(input0, collectColorBlueTransformsGreenBlueMask); - Vector128 gb1 = Sse2.And(input1, collectColorBlueTransformsGreenBlueMask); - Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); - Vector128 g = Sse2.And(gb.AsByte(), collectColorBlueTransformsGreenMask); - Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr); - Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); - Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte()); - Vector128 d = Sse2.Subtract(c, a.AsByte()); - Vector128 e = Sse2.And(d, collectColorBlueTransformsBlueMask); + Vector128 r0 = Vector128_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask); + Vector128 r1 = Vector128_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask); + Vector128 r = r0 | r1; + Vector128 gb0 = input0 & collectColorBlueTransformsGreenBlueMask; + Vector128 gb1 = input1 & collectColorBlueTransformsGreenBlueMask; + Vector128 gb = Vector128_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector128 g = gb.AsByte() & collectColorBlueTransformsGreenMask; + Vector128 a = Vector128_.MultiplyHigh(r.AsInt16(), multsr); + Vector128 b = Vector128_.MultiplyHigh(g.AsInt16(), 
multsg); + Vector128 c = gb.AsByte() - b.AsByte(); + Vector128 d = c - a.AsByte(); + Vector128 e = d & collectColorBlueTransformsBlueMask; ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = e.AsUInt16(); @@ -109,16 +112,16 @@ internal static class ColorSpaceTransformUtils int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); } } else { - CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); + CollectColorBlueTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); } } - private static void CollectColorBlueTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) + private static void CollectColorBlueTransformsScalar(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) { int pos = 0; while (tileHeight-- > 0) @@ -135,11 +138,11 @@ internal static class ColorSpaceTransformUtils public static void CollectColorRedTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) { - if (Avx2.IsSupported && tileWidth >= 16) + if (Vector256.IsHardwareAccelerated && tileWidth >= 16) { Vector256 collectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte(); Vector256 collectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte(); - var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed)); + Vector256 multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed)); const int span = 16; Span values = stackalloc ushort[span]; for (int y = 0; y < tileHeight; y++) @@ -152,15 +155,15 @@ internal static class ColorSpaceTransformUtils nuint input1Idx = x + (span / 2); Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector256 g0 = Avx2.And(input0, collectColorRedTransformsGreenMask256); // 0 0 | g 0 - Vector256 g1 = Avx2.And(input1, collectColorRedTransformsGreenMask256); - Vector256 g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 - Vector256 a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r - Vector256 a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16); - Vector256 a = Avx2.PackUnsignedSaturate(a0, a1); // x r - Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr - Vector256 c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r' - Vector256 d = Avx2.And(c, collectColorRedTransformsAndMask256); // 0 r' + Vector256 g0 = input0 & collectColorRedTransformsGreenMask256; // 0 0 | g 0 + Vector256 g1 = input1 & collectColorRedTransformsGreenMask256; + Vector256 g = Vector256_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 + Vector256 a0 = Vector256.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r + Vector256 a1 = Vector256.ShiftRightLogical(input1.AsInt32(), 16); + Vector256 a = Vector256_.PackUnsignedSaturate(a0, a1); // x r + Vector256 b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); // x dr + Vector256 c = a.AsByte() - b.AsByte(); // x r' + Vector256 d = c & collectColorRedTransformsAndMask256; // 0 r' ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = 
d.AsUInt16(); @@ -175,14 +178,14 @@ internal static class ColorSpaceTransformUtils int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); + CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); } } - else if (Sse41.IsSupported) + else if (Vector128.IsHardwareAccelerated) { Vector128 collectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte(); Vector128 collectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte(); - var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed)); + Vector128 multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed)); const int span = 8; Span values = stackalloc ushort[span]; for (int y = 0; y < tileHeight; y++) @@ -195,15 +198,15 @@ internal static class ColorSpaceTransformUtils nuint input1Idx = x + (span / 2); Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector128 g0 = Sse2.And(input0, collectColorRedTransformsGreenMask); // 0 0 | g 0 - Vector128 g1 = Sse2.And(input1, collectColorRedTransformsGreenMask); - Vector128 g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 - Vector128 a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r - Vector128 a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16); - Vector128 a = Sse41.PackUnsignedSaturate(a0, a1); // x r - Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr - Vector128 c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r' - Vector128 d = Sse2.And(c, collectColorRedTransformsAndMask); // 0 r' + Vector128 g0 = input0 & collectColorRedTransformsGreenMask; // 0 0 | g 0 + Vector128 g1 = input1 & collectColorRedTransformsGreenMask; + Vector128 g = Vector128_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 + Vector128 a0 = Vector128.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r + Vector128 a1 = Vector128.ShiftRightLogical(input1.AsInt32(), 16); + Vector128 a = Vector128_.PackUnsignedSaturate(a0, a1); // x r + Vector128 b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); // x dr + Vector128 c = a.AsByte() - b.AsByte(); // x r' + Vector128 d = c & collectColorRedTransformsAndMask; // 0 r' ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As>(ref outputRef) = d.AsUInt16(); @@ -218,16 +221,16 @@ internal static class ColorSpaceTransformUtils int leftOver = tileWidth & (span - 1); if (leftOver > 0) { - CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); + CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo); } } else { - CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo); + CollectColorRedTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToRed, histo); } } - private static void CollectColorRedTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) + private static void CollectColorRedTransformsScalar(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) { int pos = 0; while (tileHeight-- > 0) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 5287f0b75..e573097e5 100644 --- 
a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -6,6 +6,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Formats.Webp.Lossless; @@ -94,17 +95,20 @@ internal static unsafe class LosslessUtils /// The pixel data to apply the transformation. public static void AddGreenToBlueAndRed(Span pixelData) { - if (Avx2.IsSupported && pixelData.Length >= 8) + if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8) { - Vector256 addGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); + // The `255` values disable the write for alpha (A), since 0x80 is set in the control byte (high bit set). + // Each byte index is within its respective 128-bit lane (0–15 and 16–31), so this is safe for per-lane shuffle. + // The high bits are not set for the index bytes, and the values are always < 16 per lane, satisfying AVX2 lane rules. + Vector256 addGreenToBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); nuint numPixels = (uint)pixelData.Length; nuint i = 0; do { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); - Vector256 in0g0g = Avx2.Shuffle(input, addGreenToBlueAndRedMaskAvx2); - Vector256 output = Avx2.Add(input, in0g0g); + Vector256 in0g0g = Vector256_.ShufflePerLane(input, addGreenToBlueAndRedMask); + Vector256 output = input + in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 8; } @@ -115,39 +119,17 @@ internal static unsafe class LosslessUtils AddGreenToBlueAndRedScalar(pixelData[(int)i..]); } } - else if (Ssse3.IsSupported && pixelData.Length >= 4) - { - Vector128 addGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); - nuint numPixels = (uint)pixelData.Length; - nuint i = 0; - do - { - ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); - Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 in0g0g = Ssse3.Shuffle(input, addGreenToBlueAndRedMaskSsse3); - Vector128 output = Sse2.Add(input, in0g0g); - Unsafe.As>(ref pos) = output.AsUInt32(); - i += 4; - } - while (i <= numPixels - 4); - - if (i != numPixels) - { - AddGreenToBlueAndRedScalar(pixelData[(int)i..]); - } - } - else if (Sse2.IsSupported && pixelData.Length >= 4) + else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4) { + Vector128 addGreenToBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); nuint numPixels = (uint)pixelData.Length; nuint i = 0; do { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g - Vector128 b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g - Vector128 output = Sse2.Add(input.AsByte(), c.AsByte()); + Vector128 in0g0g = Vector128_.ShuffleNative(input, addGreenToBlueAndRedMask); + Vector128 output = input + in0g0g; Unsafe.As>(ref pos) = 
output.AsUInt32(); i += 4; } @@ -180,17 +162,17 @@ internal static unsafe class LosslessUtils public static void SubtractGreenFromBlueAndRed(Span pixelData) { - if (Avx2.IsSupported && pixelData.Length >= 8) + if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8) { - Vector256 subtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); + Vector256 subtractGreenFromBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); nuint numPixels = (uint)pixelData.Length; nuint i = 0; do { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); - Vector256 in0g0g = Avx2.Shuffle(input, subtractGreenFromBlueAndRedMaskAvx2); - Vector256 output = Avx2.Subtract(input, in0g0g); + Vector256 in0g0g = Vector256_.ShufflePerLane(input, subtractGreenFromBlueAndRedMask); + Vector256 output = input - in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 8; } @@ -201,39 +183,17 @@ internal static unsafe class LosslessUtils SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]); } } - else if (Ssse3.IsSupported && pixelData.Length >= 4) - { - Vector128 subtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); - nuint numPixels = (uint)pixelData.Length; - nuint i = 0; - do - { - ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); - Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 in0g0g = Ssse3.Shuffle(input, subtractGreenFromBlueAndRedMaskSsse3); - Vector128 output = Sse2.Subtract(input, in0g0g); - Unsafe.As>(ref pos) = output.AsUInt32(); - i += 4; - } - while (i <= numPixels - 4); - - if (i != numPixels) - { - SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]); - } - } - else if (Sse2.IsSupported && pixelData.Length >= 4) + else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4) { + Vector128 subtractGreenFromBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); nuint numPixels = (uint)pixelData.Length; nuint i = 0; do { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g - Vector128 b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g - Vector128 output = Sse2.Subtract(input.AsByte(), c.AsByte()); + Vector128 in0g0g = Vector128_.ShuffleNative(input, subtractGreenFromBlueAndRedMask); + Vector128 output = input - in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 4; } @@ -412,7 +372,7 @@ internal static unsafe class LosslessUtils TransformColorScalar(m, pixelData[(int)idx..], numPixels - (int)idx); } } - else if (Sse2.IsSupported && numPixels >= 4) + else if (Vector128.IsHardwareAccelerated && numPixels >= 4) { Vector128 transformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); Vector128 transformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); @@ -423,16 +383,16 @@ internal static unsafe class LosslessUtils { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector128 input = 
Unsafe.As>(ref pos); - Vector128 a = Sse2.And(input.AsByte(), transformColorAlphaGreenMask); - Vector128 b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector128 e = Sse2.ShiftLeftLogical(input.AsInt16(), 8); - Vector128 f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); - Vector128 g = Sse2.ShiftRightLogical(f.AsInt32(), 16); - Vector128 h = Sse2.Add(g.AsByte(), d.AsByte()); - Vector128 i = Sse2.And(h, transformColorRedBlueMask); - Vector128 output = Sse2.Subtract(input.AsByte(), i); + Vector128 a = input.AsByte() & transformColorAlphaGreenMask; + Vector128 b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector128 e = Vector128_.ShiftLeftLogical(input.AsInt16(), 8); + Vector128 f = Vector128_.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); + Vector128 g = Vector128.ShiftRightLogical(f.AsInt32(), 16); + Vector128 h = g.AsByte() + d.AsByte(); + Vector128 i = h & transformColorRedBlueMask; + Vector128 output = input.AsByte() - i; Unsafe.As>(ref pos) = output.AsUInt32(); idx += 4; } @@ -503,7 +463,7 @@ internal static unsafe class LosslessUtils TransformColorInverseScalar(m, pixelData[(int)idx..]); } } - else if (Sse2.IsSupported && pixelData.Length >= 4) + else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4) { Vector128 transformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); @@ -514,17 +474,17 @@ internal static unsafe class LosslessUtils { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector128 input = Unsafe.As>(ref pos); - Vector128 a = Sse2.And(input.AsByte(), transformColorInverseAlphaGreenMask); - Vector128 b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector128 e = Sse2.Add(input.AsByte(), d.AsByte()); - Vector128 f = Sse2.ShiftLeftLogical(e.AsInt16(), 8); - Vector128 g = Sse2.MultiplyHigh(f, multsb2.AsInt16()); - Vector128 h = Sse2.ShiftRightLogical(g.AsInt32(), 8); - Vector128 i = Sse2.Add(h.AsByte(), f.AsByte()); - Vector128 j = Sse2.ShiftRightLogical(i.AsInt16(), 8); - Vector128 output = Sse2.Or(j.AsByte(), a); + Vector128 a = input.AsByte() & transformColorInverseAlphaGreenMask; + Vector128 b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector128 e = input.AsByte() + d.AsByte(); + Vector128 f = Vector128_.ShiftLeftLogical(e.AsInt16(), 8); + Vector128 g = Vector128_.MultiplyHigh(f, multsb2.AsInt16()); + Vector128 h = Vector128.ShiftRightLogical(g.AsInt32(), 8); + Vector128 i = h.AsByte() + f.AsByte(); + Vector128 j = Vector128.ShiftRightLogical(i.AsInt16(), 8); + Vector128 output = j.AsByte() | a; Unsafe.As>(ref pos) = output.AsUInt32(); } @@ -1401,15 +1361,15 @@ internal static unsafe class LosslessUtils private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2) { - if (Sse2.IsSupported) + 
if (Vector128.IsHardwareAccelerated) { - Vector128 c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128.Zero); - Vector128 c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128.Zero); - Vector128 c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128.Zero); - Vector128 v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16()); - Vector128 v2 = Sse2.Subtract(v1, c2Vec.AsInt16()); - Vector128 b = Sse2.PackUnsignedSaturate(v2, v2); - return Sse2.ConvertToUInt32(b.AsUInt32()); + Vector128 c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128.Zero); + Vector128 c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128.Zero); + Vector128 c2Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128.Zero); + Vector128 v1 = c0Vec.AsInt16() + c1Vec.AsInt16(); + Vector128 v2 = v1 - c2Vec.AsInt16(); + Vector128 b = Vector128_.PackUnsignedSaturate(v2, v2); + return b.AsUInt32().ToScalar(); } { @@ -1432,20 +1392,20 @@ internal static unsafe class LosslessUtils private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { - Vector128 c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128.Zero); - Vector128 c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128.Zero); - Vector128 b0 = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128.Zero); - Vector128 avg = Sse2.Add(c1Vec.AsInt16(), c0Vec.AsInt16()); - Vector128 a0 = Sse2.ShiftRightLogical(avg, 1); - Vector128 a1 = Sse2.Subtract(a0, b0.AsInt16()); - Vector128 bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16()); - Vector128 a2 = Sse2.Subtract(a1, bgta); - Vector128 a3 = Sse2.ShiftRightArithmetic(a2, 1); - Vector128 a4 = Sse2.Add(a0, a3).AsInt16(); - Vector128 a5 = Sse2.PackUnsignedSaturate(a4, a4); - return Sse2.ConvertToUInt32(a5.AsUInt32()); + Vector128 c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128.Zero); + Vector128 c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128.Zero); + Vector128 b0 = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128.Zero); + Vector128 avg = c1Vec.AsInt16() + c0Vec.AsInt16(); + Vector128 a0 = Vector128.ShiftRightLogical(avg, 1); + Vector128 a1 = a0 - b0.AsInt16(); + Vector128 bgta = Vector128.GreaterThan(b0.AsInt16(), a0.AsInt16()); + Vector128 a2 = a1 - bgta; + Vector128 a3 = Vector128.ShiftRightArithmetic(a2, 1); + Vector128 a4 = (a0 + a3).AsInt16(); + Vector128 a5 = Vector128_.PackUnsignedSaturate(a4, a4); + return a5.AsUInt32().ToScalar(); } { @@ -1475,23 +1435,23 @@ internal static unsafe class LosslessUtils private static uint Select(uint a, uint b, uint c, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { fixed (short* ptr = &MemoryMarshal.GetReference(scratch)) { - Vector128 a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte(); - Vector128 b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte(); - Vector128 c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte(); - Vector128 ac0 = Sse2.SubtractSaturate(a0, c0); - Vector128 ca0 = Sse2.SubtractSaturate(c0, a0); - Vector128 bc0 = Sse2.SubtractSaturate(b0, c0); - Vector128 cb0 = Sse2.SubtractSaturate(c0, b0); - Vector128 ac = Sse2.Or(ac0, ca0); - Vector128 bc = Sse2.Or(bc0, cb0); - Vector128 pa = Sse2.UnpackLow(ac, Vector128.Zero); // |a - c| - Vector128 pb = Sse2.UnpackLow(bc, 
Vector128.Zero); // |b - c| - Vector128 diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16()); - Sse2.Store((ushort*)ptr, diff); + Vector128 a0 = Vector128.CreateScalar(a).AsByte(); + Vector128 b0 = Vector128.CreateScalar(b).AsByte(); + Vector128 c0 = Vector128.CreateScalar(c).AsByte(); + Vector128 ac0 = Vector128_.SubtractSaturate(a0, c0); + Vector128 ca0 = Vector128_.SubtractSaturate(c0, a0); + Vector128 bc0 = Vector128_.SubtractSaturate(b0, c0); + Vector128 cb0 = Vector128_.SubtractSaturate(c0, b0); + Vector128 ac = ac0 | ca0; + Vector128 bc = bc0 | cb0; + Vector128 pa = Vector128_.UnpackLow(ac, Vector128.Zero); // |a - c| + Vector128 pb = Vector128_.UnpackLow(bc, Vector128.Zero); // |b - c| + Vector128 diff = pb.AsUInt16() - pa.AsUInt16(); + diff.Store((ushort*)ptr); int paMinusPb = ptr[3] + ptr[2] + ptr[1] + ptr[0]; return (paMinusPb <= 0) ? a : b; } diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index aae4181ce..c65861c4b 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -5,8 +5,7 @@ using System.Buffers.Binary; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.Arm; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Webp.Lossy; @@ -17,19 +16,14 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8_Sse16x16(Span a, Span b) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { - return Vp8_Sse16xN_Avx2(a, b, 4); + return Vp8_Sse16xN_Vector256(a, b, 4); } - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { - return Vp8_Sse16xN_Sse2(a, b, 8); - } - - if (AdvSimd.IsSupported) - { - return Vp8_Sse16x16_Neon(a, b); + return Vp8_16xN_Vector128(a, b, 8); } return Vp8_SseNxN(a, b, 16, 16); @@ -39,19 +33,14 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8_Sse16x8(Span a, Span b) { - if (Avx2.IsSupported) - { - return Vp8_Sse16xN_Avx2(a, b, 2); - } - - if (Sse2.IsSupported) + if (Vector256.IsHardwareAccelerated) { - return Vp8_Sse16xN_Sse2(a, b, 4); + return Vp8_Sse16xN_Vector256(a, b, 2); } - if (AdvSimd.IsSupported) + if (Vector128.IsHardwareAccelerated) { - return Vp8_Sse16x8_Neon(a, b); + return Vp8_16xN_Vector128(a, b, 4); } return Vp8_SseNxN(a, b, 16, 8); @@ -61,40 +50,40 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8_Sse4x4(Span a, Span b) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { // Load values. 
ref byte aRef = ref MemoryMarshal.GetReference(a); ref byte bRef = ref MemoryMarshal.GetReference(b); - var a0 = Vector256.Create( + Vector256 a0 = Vector256.Create( Unsafe.As>(ref aRef), Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps))); - var a1 = Vector256.Create( + Vector256 a1 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2)), Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3))); - var b0 = Vector256.Create( + Vector256 b0 = Vector256.Create( Unsafe.As>(ref bRef), Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps))); - var b1 = Vector256.Create( + Vector256 b1 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2)), Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3))); // Combine pair of lines. - Vector256 a01 = Avx2.UnpackLow(a0.AsInt32(), a1.AsInt32()); - Vector256 b01 = Avx2.UnpackLow(b0.AsInt32(), b1.AsInt32()); + Vector256 a01 = Vector256_.UnpackLow(a0.AsInt32(), a1.AsInt32()); + Vector256 b01 = Vector256_.UnpackLow(b0.AsInt32(), b1.AsInt32()); // Convert to 16b. - Vector256 a01s = Avx2.UnpackLow(a01.AsByte(), Vector256.Zero); - Vector256 b01s = Avx2.UnpackLow(b01.AsByte(), Vector256.Zero); + Vector256 a01s = Vector256_.UnpackLow(a01.AsByte(), Vector256.Zero); + Vector256 b01s = Vector256_.UnpackLow(b01.AsByte(), Vector256.Zero); // subtract, square and accumulate. - Vector256 d0 = Avx2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); - Vector256 e0 = Avx2.MultiplyAddAdjacent(d0, d0); + Vector256 d0 = Vector256_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); + Vector256 e0 = Vector256_.MultiplyAddAdjacent(d0, d0); - return Numerics.ReduceSum(e0); + return ReduceSumVector256(e0); } - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load values. ref byte aRef = ref MemoryMarshal.GetReference(a); @@ -109,30 +98,25 @@ internal static class LossyUtils Vector128 b3 = Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3)); // Combine pair of lines. - Vector128 a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32()); - Vector128 a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32()); - Vector128 b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32()); - Vector128 b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32()); + Vector128 a01 = Vector128_.UnpackLow(a0.AsInt32(), a1.AsInt32()); + Vector128 a23 = Vector128_.UnpackLow(a2.AsInt32(), a3.AsInt32()); + Vector128 b01 = Vector128_.UnpackLow(b0.AsInt32(), b1.AsInt32()); + Vector128 b23 = Vector128_.UnpackLow(b2.AsInt32(), b3.AsInt32()); // Convert to 16b. - Vector128 a01s = Sse2.UnpackLow(a01.AsByte(), Vector128.Zero); - Vector128 a23s = Sse2.UnpackLow(a23.AsByte(), Vector128.Zero); - Vector128 b01s = Sse2.UnpackLow(b01.AsByte(), Vector128.Zero); - Vector128 b23s = Sse2.UnpackLow(b23.AsByte(), Vector128.Zero); + Vector128 a01s = Vector128_.UnpackLow(a01.AsByte(), Vector128.Zero); + Vector128 a23s = Vector128_.UnpackLow(a23.AsByte(), Vector128.Zero); + Vector128 b01s = Vector128_.UnpackLow(b01.AsByte(), Vector128.Zero); + Vector128 b23s = Vector128_.UnpackLow(b23.AsByte(), Vector128.Zero); // subtract, square and accumulate. 
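Illustrative aside (not part of the patch): every branch of Vp8_Sse4x4 and Vp8_Sse16xN computes the same quantity as the scalar fallback Vp8_SseNxN, namely the sum of squared differences over the block. A plain sketch, assuming rows are WebpConstants.Bps bytes apart as in the code above:

internal static class SseReference
{
    // Sum of squared differences between two width x height blocks whose rows are `stride` bytes apart.
    public static int SseBlock(ReadOnlySpan<byte> a, ReadOnlySpan<byte> b, int width, int height, int stride)
    {
        int sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int d = a[(y * stride) + x] - b[(y * stride) + x];
                sum += d * d; // Subtract, square and accumulate.
            }
        }

        return sum;
    }
}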
- Vector128 d0 = Sse2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); - Vector128 d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16()); - Vector128 e0 = Sse2.MultiplyAddAdjacent(d0, d0); - Vector128 e1 = Sse2.MultiplyAddAdjacent(d1, d1); - Vector128 sum = Sse2.Add(e0, e1); + Vector128 d0 = Vector128_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); + Vector128 d1 = Vector128_.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16()); + Vector128 e0 = Vector128_.MultiplyAddAdjacent(d0, d0); + Vector128 e1 = Vector128_.MultiplyAddAdjacent(d1, d1); + Vector128 sum = e0 + e1; - return Numerics.ReduceSum(sum); - } - - if (AdvSimd.IsSupported) - { - return Vp8_Sse4x4_Neon(a, b); + return ReduceSumVector128(sum); } return Vp8_SseNxN(a, b, 4, 4); @@ -158,7 +142,7 @@ internal static class LossyUtils } [MethodImpl(InliningOptions.ShortMethod)] - private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs) + private static int Vp8_16xN_Vector128(Span a, Span b, int numPairs) { Vector128 sum = Vector128.Zero; nuint offset = 0; @@ -172,18 +156,18 @@ internal static class LossyUtils Vector128 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps)); Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps)); - Vector128 sum1 = SubtractAndAccumulate(a0, b0); - Vector128 sum2 = SubtractAndAccumulate(a1, b1); - sum = Sse2.Add(sum, Sse2.Add(sum1, sum2)); + Vector128 sum1 = SubtractAndAccumulateVector128(a0, b0); + Vector128 sum2 = SubtractAndAccumulateVector128(a1, b1); + sum += sum1 + sum2; offset += 2 * WebpConstants.Bps; } - return Numerics.ReduceSum(sum); + return ReduceSumVector128(sum); } [MethodImpl(InliningOptions.ShortMethod)] - private static int Vp8_Sse16xN_Avx2(Span a, Span b, int numPairs) + private static int Vp8_Sse16xN_Vector256(Span a, Span b, int numPairs) { Vector256 sum = Vector256.Zero; nuint offset = 0; @@ -192,154 +176,65 @@ internal static class LossyUtils for (int i = 0; i < numPairs; i++) { // Load values. 
- var a0 = Vector256.Create( + Vector256 a0 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref aRef, offset)), Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps))); - var b0 = Vector256.Create( + Vector256 b0 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref bRef, offset)), Unsafe.As>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps))); - var a1 = Vector256.Create( + Vector256 a1 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref aRef, offset + (2 * WebpConstants.Bps))), Unsafe.As>(ref Unsafe.Add(ref aRef, offset + (3 * WebpConstants.Bps)))); - var b1 = Vector256.Create( + Vector256 b1 = Vector256.Create( Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (2 * WebpConstants.Bps))), Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (3 * WebpConstants.Bps)))); - Vector256 sum1 = SubtractAndAccumulate(a0, b0); - Vector256 sum2 = SubtractAndAccumulate(a1, b1); - sum = Avx2.Add(sum, Avx2.Add(sum1, sum2)); + Vector256 sum1 = SubtractAndAccumulateVector256(a0, b0); + Vector256 sum2 = SubtractAndAccumulateVector256(a1, b1); + sum += sum1 + sum2; offset += 4 * WebpConstants.Bps; } - return Numerics.ReduceSum(sum); + return ReduceSumVector256(sum); } [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe int Vp8_Sse16x16_Neon(Span a, Span b) - { - Vector128 sum = Vector128.Zero; - fixed (byte* aRef = &MemoryMarshal.GetReference(a)) - { - fixed (byte* bRef = &MemoryMarshal.GetReference(b)) - { - for (int y = 0; y < 16; y++) - { - sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum); - } - } - } - - return (int)Vector128.Sum(sum); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe int Vp8_Sse16x8_Neon(Span a, Span b) - { - Vector128 sum = Vector128.Zero; - fixed (byte* aRef = &MemoryMarshal.GetReference(a)) - { - fixed (byte* bRef = &MemoryMarshal.GetReference(b)) - { - for (int y = 0; y < 8; y++) - { - sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum); - } - } - } - - return (int)Vector128.Sum(sum); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static int Vp8_Sse4x4_Neon(Span a, Span b) - { - Vector128 a0 = Load4x4Neon(a).AsByte(); - Vector128 b0 = Load4x4Neon(b).AsByte(); - Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); - Vector64 absDiffLower = absDiff.GetLower().AsByte(); - Vector64 absDiffUpper = absDiff.GetUpper().AsByte(); - Vector128 prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); - Vector128 prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper); - - // pair-wise adds and widen. 
- Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); - Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); - - Vector128 sum = AdvSimd.Add(sum1, sum2); - - return (int)Vector128.Sum(sum); - } - - // Load all 4x4 pixels into a single Vector128 - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe Vector128 Load4x4Neon(Span src) - { - fixed (byte* srcRef = &MemoryMarshal.GetReference(src)) - { - Vector128 output = Vector128.Zero; - output = AdvSimd.LoadAndInsertScalar(output, 0, (uint*)srcRef); - output = AdvSimd.LoadAndInsertScalar(output, 1, (uint*)(srcRef + WebpConstants.Bps)); - output = AdvSimd.LoadAndInsertScalar(output, 2, (uint*)(srcRef + (WebpConstants.Bps * 2))); - output = AdvSimd.LoadAndInsertScalar(output, 3, (uint*)(srcRef + (WebpConstants.Bps * 3))); - return output; - } - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe Vector128 AccumulateSSE16Neon(byte* a, byte* b, Vector128 sum) - { - Vector128 a0 = AdvSimd.LoadVector128(a); - Vector128 b0 = AdvSimd.LoadVector128(b); - - Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); - Vector64 absDiffLower = absDiff.GetLower(); - Vector64 absDiffUpper = absDiff.GetUpper(); - Vector128 prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); - Vector128 prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper); - - // pair-wise adds and widen. - Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); - Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); - return AdvSimd.Add(sum, AdvSimd.Add(sum1, sum2)); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b) + private static Vector128 SubtractAndAccumulateVector128(Vector128 a, Vector128 b) { // Take abs(a-b) in 8b. - Vector128 ab = Sse2.SubtractSaturate(a, b); - Vector128 ba = Sse2.SubtractSaturate(b, a); - Vector128 absAb = Sse2.Or(ab, ba); + Vector128 ab = Vector128_.SubtractSaturate(a, b); + Vector128 ba = Vector128_.SubtractSaturate(b, a); + Vector128 absAb = ab | ba; // Zero-extend to 16b. - Vector128 c0 = Sse2.UnpackLow(absAb, Vector128.Zero); - Vector128 c1 = Sse2.UnpackHigh(absAb, Vector128.Zero); + Vector128 c0 = Vector128_.UnpackLow(absAb, Vector128.Zero); + Vector128 c1 = Vector128_.UnpackHigh(absAb, Vector128.Zero); // Multiply with self. - Vector128 sum1 = Sse2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); - Vector128 sum2 = Sse2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); + Vector128 sum1 = Vector128_.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); + Vector128 sum2 = Vector128_.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); - return Sse2.Add(sum1, sum2); + return sum1 + sum2; } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector256 SubtractAndAccumulate(Vector256 a, Vector256 b) + private static Vector256 SubtractAndAccumulateVector256(Vector256 a, Vector256 b) { // Take abs(a-b) in 8b. - Vector256 ab = Avx2.SubtractSaturate(a, b); - Vector256 ba = Avx2.SubtractSaturate(b, a); - Vector256 absAb = Avx2.Or(ab, ba); + Vector256 ab = Vector256_.SubtractSaturate(a, b); + Vector256 ba = Vector256_.SubtractSaturate(b, a); + Vector256 absAb = ab | ba; // Zero-extend to 16b. - Vector256 c0 = Avx2.UnpackLow(absAb, Vector256.Zero); - Vector256 c1 = Avx2.UnpackHigh(absAb, Vector256.Zero); + Vector256 c0 = Vector256_.UnpackLow(absAb, Vector256.Zero); + Vector256 c1 = Vector256_.UnpackHigh(absAb, Vector256.Zero); // Multiply with self. 
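Illustrative aside (not part of the patch): the "abs(a-b) in 8b" step below relies on unsigned saturating subtraction. For unsigned bytes, SubtractSaturate(a, b) equals max(a - b, 0), so at most one of the two orderings is non-zero and OR-ing them yields |a - b|. A scalar sketch of that identity:

internal static class AbsDiffReference
{
    public static byte AbsDiff(byte a, byte b)
    {
        byte ab = (byte)Math.Max(a - b, 0); // SubtractSaturate(a, b)
        byte ba = (byte)Math.Max(b - a, 0); // SubtractSaturate(b, a)
        return (byte)(ab | ba);             // One operand is zero, so OR equals |a - b|.
    }
}

The widened absolute differences are then squared and summed pairwise via MultiplyAddAdjacent.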
- Vector256 sum1 = Avx2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); - Vector256 sum2 = Avx2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); + Vector256 sum1 = Vector256_.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); + Vector256 sum2 = Vector256_.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); - return Avx2.Add(sum1, sum2); + return sum1 + sum2; } [MethodImpl(InliningOptions.ShortMethod)] @@ -378,17 +273,16 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8Disto4X4(Span a, Span b, Span w, Span scratch) { - if (Sse41.IsSupported) + if (Vector128.IsHardwareAccelerated) { - int diffSum = TTransformSse41(a, b, w); + int diffSum = TTransformVector128(a, b, w); return Math.Abs(diffSum) >> 5; } - else - { - int sum1 = TTransform(a, w, scratch); - int sum2 = TTransform(b, w, scratch); - return Math.Abs(sum2 - sum1) >> 5; - } + + int sum1 = TTransform(a, w, scratch); + int sum2 = TTransform(b, w, scratch); + + return Math.Abs(sum2 - sum1) >> 5; } public static void DC16(Span dst, Span yuv, int offset) @@ -905,7 +799,7 @@ internal static class LossyUtils /// Returns the weighted sum of the absolute value of transformed coefficients. /// w[] contains a row-major 4 by 4 symmetric matrix. /// - public static int TTransformSse41(Span inputA, Span inputB, Span w) + public static int TTransformVector128(Span inputA, Span inputB, Span w) { // Load and combine inputs. Vector128 ina0 = Unsafe.As>(ref MemoryMarshal.GetReference(inputA)); @@ -918,14 +812,14 @@ internal static class LossyUtils Vector128 inb3 = Unsafe.As>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16))).AsInt64(); // Combine inA and inB (we'll do two transforms in parallel). - Vector128 inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32()); - Vector128 inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32()); - Vector128 inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32()); - Vector128 inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32()); - Vector128 tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte()); - Vector128 tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte()); - Vector128 tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte()); - Vector128 tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte()); + Vector128 inab0 = Vector128_.UnpackLow(ina0.AsInt32(), inb0.AsInt32()); + Vector128 inab1 = Vector128_.UnpackLow(ina1.AsInt32(), inb1.AsInt32()); + Vector128 inab2 = Vector128_.UnpackLow(ina2.AsInt32(), inb2.AsInt32()); + Vector128 inab3 = Vector128_.UnpackLow(ina3.AsInt32(), inb3.AsInt32()); + Vector128 tmp0 = Vector128.WidenLower(inab0.AsByte()).AsInt16(); + Vector128 tmp1 = Vector128.WidenLower(inab1.AsByte()).AsInt16(); + Vector128 tmp2 = Vector128.WidenLower(inab2.AsByte()).AsInt16(); + Vector128 tmp3 = Vector128.WidenLower(inab3.AsByte()).AsInt16(); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 @@ -934,21 +828,21 @@ internal static class LossyUtils // Vertical pass first to avoid a transpose (vertical and horizontal passes // are commutative because w/kWeightY is symmetric) and subsequent transpose. // Calculate a and b (two 4x4 at once). 
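Illustrative aside (not part of the patch): the a0..a3 / b0..b3 steps above and below implement the 4-point Hadamard-style butterfly used by the VP8 distortion metric, applied to whole SIMD registers at once. Per element the computation is simply:

internal static class HadamardReference
{
    // One 4-point butterfly pass (16-bit arithmetic wraps, matching the vector adds and subtracts).
    public static void Butterfly(ref short t0, ref short t1, ref short t2, ref short t3)
    {
        short a0 = (short)(t0 + t2);
        short a1 = (short)(t1 + t3);
        short a2 = (short)(t1 - t3);
        short a3 = (short)(t0 - t2);
        t0 = (short)(a0 + a1);
        t1 = (short)(a3 + a2);
        t2 = (short)(a3 - a2);
        t3 = (short)(a0 - a1);
    }
}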
- Vector128 a0 = Sse2.Add(tmp0, tmp2); - Vector128 a1 = Sse2.Add(tmp1, tmp3); - Vector128 a2 = Sse2.Subtract(tmp1, tmp3); - Vector128 a3 = Sse2.Subtract(tmp0, tmp2); - Vector128 b0 = Sse2.Add(a0, a1); - Vector128 b1 = Sse2.Add(a3, a2); - Vector128 b2 = Sse2.Subtract(a3, a2); - Vector128 b3 = Sse2.Subtract(a0, a1); + Vector128 a0 = tmp0 + tmp2; + Vector128 a1 = tmp1 + tmp3; + Vector128 a2 = tmp1 - tmp3; + Vector128 a3 = tmp0 - tmp2; + Vector128 b0 = a0 + a1; + Vector128 b1 = a3 + a2; + Vector128 b2 = a3 - a2; + Vector128 b3 = a0 - a1; // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); + Vp8Transpose_2_4x4_16bVector128(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -959,71 +853,71 @@ internal static class LossyUtils Vector128 w8 = Unsafe.As>(ref MemoryMarshal.GetReference(w.Slice(8, 8))); // Calculate a and b (two 4x4 at once). - a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16()); - a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16()); - a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16()); - a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16()); - b0 = Sse2.Add(a0, a1); - b1 = Sse2.Add(a3, a2); - b2 = Sse2.Subtract(a3, a2); - b3 = Sse2.Subtract(a0, a1); + a0 = output0.AsInt16() + output2.AsInt16(); + a1 = output1.AsInt16() + output3.AsInt16(); + a2 = output1.AsInt16() - output3.AsInt16(); + a3 = output0.AsInt16() - output2.AsInt16(); + b0 = a0 + a1; + b1 = a3 + a2; + b2 = a3 - a2; + b3 = a0 - a1; // Separate the transforms of inA and inB. - Vector128 ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64()); - Vector128 ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64()); - Vector128 bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64()); - Vector128 bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64()); + Vector128 ab0 = Vector128_.UnpackLow(b0.AsInt64(), b1.AsInt64()); + Vector128 ab2 = Vector128_.UnpackLow(b2.AsInt64(), b3.AsInt64()); + Vector128 bb0 = Vector128_.UnpackHigh(b0.AsInt64(), b1.AsInt64()); + Vector128 bb2 = Vector128_.UnpackHigh(b2.AsInt64(), b3.AsInt64()); - Vector128 ab0Abs = Ssse3.Abs(ab0.AsInt16()); - Vector128 ab2Abs = Ssse3.Abs(ab2.AsInt16()); - Vector128 b0Abs = Ssse3.Abs(bb0.AsInt16()); - Vector128 bb2Abs = Ssse3.Abs(bb2.AsInt16()); + Vector128 ab0Abs = Vector128.Abs(ab0.AsInt16()); + Vector128 ab2Abs = Vector128.Abs(ab2.AsInt16()); + Vector128 b0Abs = Vector128.Abs(bb0.AsInt16()); + Vector128 bb2Abs = Vector128.Abs(bb2.AsInt16()); // weighted sums. 
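// Sketch of the per-lane semantics of the MultiplyAddAdjacent helper used for
// the weighted sums below (equivalent to the SSE2 pmaddwd instruction; the
// sketch name is hypothetical): adjacent 16-bit products are summed into one
// 32-bit lane, so eight |coefficient| * weight products collapse into four
// partial sums.
private static Vector128<int> MultiplyAddAdjacentSketch(ReadOnlySpan<short> a, ReadOnlySpan<short> b)
{
    Span<int> result = stackalloc int[4];
    for (int i = 0; i < 4; i++)
    {
        result[i] = (a[2 * i] * b[2 * i]) + (a[(2 * i) + 1] * b[(2 * i) + 1]);
    }

    return Vector128.Create(result[0], result[1], result[2], result[3]);
}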
- Vector128 ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16()); - Vector128 ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16()); - Vector128 b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16()); - Vector128 bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16()); - Vector128 ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8); - Vector128 b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8); + Vector128 ab0mulw0 = Vector128_.MultiplyAddAdjacent(ab0Abs, w0.AsInt16()); + Vector128 ab2mulw8 = Vector128_.MultiplyAddAdjacent(ab2Abs, w8.AsInt16()); + Vector128 b0mulw0 = Vector128_.MultiplyAddAdjacent(b0Abs, w0.AsInt16()); + Vector128 bb2mulw8 = Vector128_.MultiplyAddAdjacent(bb2Abs, w8.AsInt16()); + Vector128 ab0ab2Sum = ab0mulw0 + ab2mulw8; + Vector128 b0w0bb2w8Sum = b0mulw0 + bb2mulw8; // difference of weighted sums. - Vector128 result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32()); + Vector128 result = ab0ab2Sum - b0w0bb2w8Sum; - return Numerics.ReduceSum(result); + return ReduceSumVector128(result); } // Transpose two 4x4 16b matrices horizontally stored in registers. [MethodImpl(InliningOptions.ShortMethod)] - public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) + public static void Vp8Transpose_2_4x4_16bVector128(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) { // Transpose the two 4x4. // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 - Vector128 transpose00 = Sse2.UnpackLow(b0, b1); - Vector128 transpose01 = Sse2.UnpackLow(b2, b3); - Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); - Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); + Vector128 transpose00 = Vector128_.UnpackLow(b0, b1); + Vector128 transpose01 = Vector128_.UnpackLow(b2, b3); + Vector128 transpose02 = Vector128_.UnpackHigh(b0, b1); + Vector128 transpose03 = Vector128_.UnpackHigh(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 - Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); - Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose10 = Vector128_.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose11 = Vector128_.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose12 = Vector128_.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose13 = Vector128_.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 - output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); - output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); - output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); - output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + output0 = Vector128_.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); 
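// A plain-array sketch of what Vp8Transpose_2_4x4_16bVector128 produces
// (sketch name hypothetical): each input row holds row i of matrix A in lanes
// 0..3 and row i of matrix B in lanes 4..7, and the unpack cascade emits the
// two transposed matrices in the same side-by-side layout.
private static void TransposeTwo4x4Sketch(ReadOnlySpan<short> input, Span<short> output)
{
    // input and output are 4 rows of 8 lanes, stored row-major (32 values each).
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            output[(i * 8) + j] = input[(j * 8) + i];           // A transposed
            output[(i * 8) + j + 4] = input[(j * 8) + i + 4];   // B transposed
        }
    }
}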
+ output1 = Vector128_.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); + output2 = Vector128_.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); + output3 = Vector128_.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -1035,7 +929,7 @@ internal static class LossyUtils // Does two transforms. public static void TransformTwo(Span src, Span dst, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: @@ -1057,24 +951,24 @@ internal static class LossyUtils // Load and concatenate the transform coefficients (we'll do two transforms // in parallel). ref short srcRef = ref MemoryMarshal.GetReference(src); - var in0 = Vector128.Create(Unsafe.As(ref srcRef), 0); - var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 4)), 0); - var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 8)), 0); - var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 12)), 0); + Vector128 in0 = Vector128.Create(Unsafe.As(ref srcRef), 0); + Vector128 in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 4)), 0); + Vector128 in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 8)), 0); + Vector128 in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 12)), 0); // a00 a10 a20 a30 x x x x // a01 a11 a21 a31 x x x x // a02 a12 a22 a32 x x x x // a03 a13 a23 a33 x x x x - var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 16)), 0); - var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 20)), 0); - var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 24)), 0); - var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 28)), 0); + Vector128 inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 16)), 0); + Vector128 inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 20)), 0); + Vector128 inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 24)), 0); + Vector128 inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 28)), 0); - in0 = Sse2.UnpackLow(in0, inb0); - in1 = Sse2.UnpackLow(in1, inb1); - in2 = Sse2.UnpackLow(in2, inb2); - in3 = Sse2.UnpackLow(in3, inb3); + in0 = Vector128_.UnpackLow(in0, inb0); + in1 = Vector128_.UnpackLow(in1, inb1); + in2 = Vector128_.UnpackLow(in2, inb2); + in3 = Vector128_.UnpackLow(in3, inb3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -1083,67 +977,67 @@ internal static class LossyUtils // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. 
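// Scalar sketch of the "trick" multiplications mentioned above (helper names
// hypothetical). The constants 20091 and -30068 used below are the 16-bit
// fixed-point factors sqrt(2) * cos(pi/8) ~= 85627 / 2^16 and
// sqrt(2) * sin(pi/8) ~= 35468 / 2^16 with 65536 subtracted so they fit a
// signed 16-bit lane; a high-half multiply by (k - 65536) plus one extra add
// of the input restores the intended product.
private const int K1Sketch = 20091;   // 85627 - 65536
private const int K2Sketch = -30068;  // 35468 - 65536

private static int MulHiSketch(int x, int k) => (x * k) >> 16;          // per-lane MultiplyHigh

private static int MulByCosSketch(int x) => MulHiSketch(x, K1Sketch) + x; // ~ x * 85627 / 65536
private static int MulBySinSketch(int x) => MulHiSketch(x, K2Sketch) + x; // ~ x * 35468 / 65536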
- Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + Vector128 a = in0.AsInt16() + in2.AsInt16(); + Vector128 b = in0.AsInt16() - in2.AsInt16(); - var k1 = Vector128.Create((short)20091); - var k2 = Vector128.Create((short)-30068); + Vector128 k1 = Vector128.Create((short)20091); + Vector128 k2 = Vector128.Create((short)-30068); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3.AsInt16(), c4); + Vector128 c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2); + Vector128 c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1); + Vector128 c3 = in1.AsInt16() - in3.AsInt16(); + Vector128 c4 = c1 - c2; + Vector128 c = c3.AsInt16() + c4; // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); + Vector128 d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1); + Vector128 d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2); + Vector128 d3 = in1.AsInt16() + in3.AsInt16(); + Vector128 d4 = d1 + d2; + Vector128 d = d3 + d4; // Second pass. - Vector128 tmp0 = Sse2.Add(a.AsInt16(), d); - Vector128 tmp1 = Sse2.Add(b.AsInt16(), c); - Vector128 tmp2 = Sse2.Subtract(b.AsInt16(), c); - Vector128 tmp3 = Sse2.Subtract(a.AsInt16(), d); + Vector128 tmp0 = a.AsInt16() + d; + Vector128 tmp1 = b.AsInt16() + c; + Vector128 tmp2 = b.AsInt16() - c; + Vector128 tmp3 = a.AsInt16() - d; // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4)); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); + Vector128 dc = t0.AsInt16() + Vector128.Create((short)4); + a = dc + t2.AsInt16(); + b = dc - t2.AsInt16(); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); + c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2); + c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1); + c3 = t1.AsInt16() - t3.AsInt16(); + c4 = c1 - c2; + c = c3 + c4; // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); + d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1); + d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2); + d3 = t1.AsInt16() + t3.AsInt16(); + d4 = d1 + d2; + d = d3 + d4; // Second pass. 
- tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + tmp0 = a + d; + tmp1 = b + c; + tmp2 = b - c; + tmp3 = a - d; + Vector128 shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'dst' and store. // Load the reference(s). @@ -1155,22 +1049,22 @@ internal static class LossyUtils Vector128 dst3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3)), 0).AsByte(); // Convert to 16b. - dst0 = Sse2.UnpackLow(dst0, Vector128.Zero); - dst1 = Sse2.UnpackLow(dst1, Vector128.Zero); - dst2 = Sse2.UnpackLow(dst2, Vector128.Zero); - dst3 = Sse2.UnpackLow(dst3, Vector128.Zero); + dst0 = Vector128_.UnpackLow(dst0, Vector128.Zero); + dst1 = Vector128_.UnpackLow(dst1, Vector128.Zero); + dst2 = Vector128_.UnpackLow(dst2, Vector128.Zero); + dst3 = Vector128_.UnpackLow(dst3, Vector128.Zero); // Add the inverse transform(s). - dst0 = Sse2.Add(dst0.AsInt16(), t0.AsInt16()).AsByte(); - dst1 = Sse2.Add(dst1.AsInt16(), t1.AsInt16()).AsByte(); - dst2 = Sse2.Add(dst2.AsInt16(), t2.AsInt16()).AsByte(); - dst3 = Sse2.Add(dst3.AsInt16(), t3.AsInt16()).AsByte(); + dst0 = (dst0.AsInt16() + t0.AsInt16()).AsByte(); + dst1 = (dst1.AsInt16() + t1.AsInt16()).AsByte(); + dst2 = (dst2.AsInt16() + t2.AsInt16()).AsByte(); + dst3 = (dst3.AsInt16() + t3.AsInt16()).AsByte(); // Unsigned saturate to 8b. - dst0 = Sse2.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); - dst1 = Sse2.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); - dst2 = Sse2.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); - dst3 = Sse2.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); + dst0 = Vector128_.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); + dst1 = Vector128_.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); + dst2 = Vector128_.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); + dst3 = Vector128_.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); // Store the results. // Store eight bytes/pixels per line. @@ -1189,14 +1083,14 @@ internal static class LossyUtils public static void TransformOne(Span src, Span dst, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load and concatenate the transform coefficients. 
ref short srcRef = ref MemoryMarshal.GetReference(src); - var in0 = Vector128.Create(Unsafe.As(ref srcRef), 0); - var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 4)), 0); - var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 8)), 0); - var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 12)), 0); + Vector128 in0 = Vector128.Create(Unsafe.As(ref srcRef), 0); + Vector128 in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 4)), 0); + Vector128 in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 8)), 0); + Vector128 in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, 12)), 0); // a00 a10 a20 a30 x x x x // a01 a11 a21 a31 x x x x @@ -1205,102 +1099,102 @@ internal static class LossyUtils // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + Vector128 a = in0.AsInt16() + in2.AsInt16(); + Vector128 b = in0.AsInt16() - in2.AsInt16(); - var k1 = Vector128.Create((short)20091); - var k2 = Vector128.Create((short)-30068); + Vector128 k1 = Vector128.Create((short)20091); + Vector128 k2 = Vector128.Create((short)-30068); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3.AsInt16(), c4); + Vector128 c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2); + Vector128 c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1); + Vector128 c3 = in1.AsInt16() - in3.AsInt16(); + Vector128 c4 = c1 - c2; + Vector128 c = c3.AsInt16() + c4; // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); + Vector128 d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1); + Vector128 d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2); + Vector128 d3 = in1.AsInt16() + in3.AsInt16(); + Vector128 d4 = d1 + d2; + Vector128 d = d3 + d4; // Second pass. - Vector128 tmp0 = Sse2.Add(a.AsInt16(), d); - Vector128 tmp1 = Sse2.Add(b.AsInt16(), c); - Vector128 tmp2 = Sse2.Subtract(b.AsInt16(), c); - Vector128 tmp3 = Sse2.Subtract(a.AsInt16(), d); + Vector128 tmp0 = a.AsInt16() + d; + Vector128 tmp1 = b.AsInt16() + c; + Vector128 tmp2 = b.AsInt16() - c; + Vector128 tmp3 = a.AsInt16() - d; // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. 
- Vector128 dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4)); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); + Vector128 dc = t0.AsInt16() + Vector128.Create((short)4); + a = dc + t2.AsInt16(); + b = dc - t2.AsInt16(); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); + c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2); + c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1); + c3 = t1.AsInt16() - t3.AsInt16(); + c4 = c1 - c2; + c = c3 + c4; // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); + d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1); + d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2); + d3 = t1.AsInt16() + t3.AsInt16(); + d4 = d1 + d2; + d = d3 + d4; // Second pass. - tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + tmp0 = a + d; + tmp1 = b + c; + tmp2 = b - c; + tmp3 = a - d; + Vector128 shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'dst' and store. // Load the reference(s). // Load four bytes/pixels per line. ref byte dstRef = ref MemoryMarshal.GetReference(dst); - Vector128 dst0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref dstRef)).AsByte(); - Vector128 dst1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps))).AsByte(); - Vector128 dst2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 2))).AsByte(); - Vector128 dst3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3))).AsByte(); + Vector128 dst0 = Vector128.CreateScalar(Unsafe.As(ref dstRef)).AsByte(); + Vector128 dst1 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps))).AsByte(); + Vector128 dst2 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 2))).AsByte(); + Vector128 dst3 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3))).AsByte(); // Convert to 16b. - dst0 = Sse2.UnpackLow(dst0, Vector128.Zero); - dst1 = Sse2.UnpackLow(dst1, Vector128.Zero); - dst2 = Sse2.UnpackLow(dst2, Vector128.Zero); - dst3 = Sse2.UnpackLow(dst3, Vector128.Zero); + dst0 = Vector128_.UnpackLow(dst0, Vector128.Zero); + dst1 = Vector128_.UnpackLow(dst1, Vector128.Zero); + dst2 = Vector128_.UnpackLow(dst2, Vector128.Zero); + dst3 = Vector128_.UnpackLow(dst3, Vector128.Zero); // Add the inverse transform(s). 
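// Scalar sketch (helper name hypothetical) of the reconstruction step that
// follows: each reference pixel is widened to 16 bits, the transform residual
// is added, and PackUnsignedSaturate clamps the result back to the byte range.
private static byte ReconstructPixelSketch(byte reference, short residual)
    => (byte)Math.Clamp(reference + residual, 0, 255);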
- dst0 = Sse2.Add(dst0.AsInt16(), t0.AsInt16()).AsByte(); - dst1 = Sse2.Add(dst1.AsInt16(), t1.AsInt16()).AsByte(); - dst2 = Sse2.Add(dst2.AsInt16(), t2.AsInt16()).AsByte(); - dst3 = Sse2.Add(dst3.AsInt16(), t3.AsInt16()).AsByte(); + dst0 = (dst0.AsInt16() + t0.AsInt16()).AsByte(); + dst1 = (dst1.AsInt16() + t1.AsInt16()).AsByte(); + dst2 = (dst2.AsInt16() + t2.AsInt16()).AsByte(); + dst3 = (dst3.AsInt16() + t3.AsInt16()).AsByte(); // Unsigned saturate to 8b. - dst0 = Sse2.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); - dst1 = Sse2.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); - dst2 = Sse2.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); - dst3 = Sse2.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); + dst0 = Vector128_.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); + dst1 = Vector128_.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); + dst2 = Vector128_.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); + dst3 = Vector128_.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); // Store the results. // Store four bytes/pixels per line. ref byte outputRef = ref MemoryMarshal.GetReference(dst); - int output0 = Sse2.ConvertToInt32(dst0.AsInt32()); - int output1 = Sse2.ConvertToInt32(dst1.AsInt32()); - int output2 = Sse2.ConvertToInt32(dst2.AsInt32()); - int output3 = Sse2.ConvertToInt32(dst3.AsInt32()); + int output0 = dst0.AsInt32().ToScalar(); + int output1 = dst1.AsInt32().ToScalar(); + int output2 = dst2.AsInt32().ToScalar(); + int output3 = dst3.AsInt32().ToScalar(); Unsafe.As(ref outputRef) = output0; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; @@ -1413,7 +1307,7 @@ internal static class LossyUtils // Simple In-loop filtering (Paragraph 15.2) public static void SimpleVFilter16(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load. ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), (uint)offset); @@ -1423,7 +1317,7 @@ internal static class LossyUtils Vector128 q0 = Unsafe.As>(ref pRef); Vector128 q1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)stride)); - DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); + DoFilter2Vector128(ref p1, ref p0, ref q0, ref q1, thresh); // Store. 
ref byte outputRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), (uint)offset); @@ -1446,14 +1340,14 @@ internal static class LossyUtils public static void SimpleHFilter16(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Beginning of p1 ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), (uint)(offset - 2)); - Load16x4(ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1); - DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); - Store16x4(p1, p0, q0, q1, ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride); + Load16x4Vector128(ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1); + DoFilter2Vector128(ref p1, ref p0, ref q0, ref q1, thresh); + Store16x4Vector128(p1, p0, q0, q1, ref pRef, ref Unsafe.Add(ref pRef, 8 * (uint)stride), stride); } else { @@ -1471,7 +1365,7 @@ internal static class LossyUtils public static void SimpleVFilter16i(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { for (int k = 3; k > 0; k--) { @@ -1491,7 +1385,7 @@ internal static class LossyUtils public static void SimpleHFilter16i(Span p, int offset, int stride, int thresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { for (int k = 3; k > 0; k--) { @@ -1513,7 +1407,7 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter16(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); Vector128 t1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset - (4 * stride)))); @@ -1521,21 +1415,21 @@ internal static class LossyUtils Vector128 p1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset - (2 * stride)))); Vector128 p0 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset - stride))); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t1, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Vector128.Max(mask, AbsVector128(t1, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); Vector128 q0 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); Vector128 q1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + stride))); Vector128 q2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (2 * stride)))); t1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (3 * stride)))); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t1, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t1, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); // Store. 
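// Scalar sketch (an approximation, not the shipped fallback) of the decision
// the mask built above encodes per pixel column: the edge is filtered only
// when every neighbouring difference stays within the interior threshold and
// the edge itself passes the NeedsFilter test.
private static bool ShouldFilterSketch(
    byte p3, byte p2, byte p1, byte p0,
    byte q0, byte q1, byte q2, byte q3,
    int thresh, int ithresh)
{
    int interior = Math.Max(
        Math.Max(Math.Abs(p3 - p2), Math.Abs(p2 - p1)),
        Math.Max(
            Math.Max(Math.Abs(p1 - p0), Math.Abs(q1 - q0)),
            Math.Max(Math.Abs(q3 - q2), Math.Abs(q2 - q1))));

    int edge = (2 * Math.Abs(p0 - q0)) + (Math.Abs(p1 - q1) / 2);

    return interior <= ithresh && edge <= thresh;
}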
ref byte outputRef = ref MemoryMarshal.GetReference(p); @@ -1555,27 +1449,27 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter16(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); ref byte bRef = ref Unsafe.Add(ref pRef, (uint)offset - 4); - Load16x4(ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); + Load16x4Vector128(ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); - Load16x4(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); + Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(q3, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(q3, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); - Store16x4(p3, p2, p1, p0, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); - Store16x4(q0, q1, q2, q3, ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride); + Store16x4Vector128(p3, p2, p1, p0, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); + Store16x4Vector128(q0, q1, q2, q3, ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride); } else { @@ -1585,7 +1479,7 @@ internal static class LossyUtils public static void VFilter16i(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); Vector128 p3 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); @@ -1599,23 +1493,23 @@ internal static class LossyUtils Span b = p[(offset + (2 * stride))..]; offset += 4 * stride; - Vector128 mask = Abs(p0, p1); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p0, p1); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); p3 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); p2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + stride))); Vector128 tmp1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (2 * stride)))); Vector128 tmp2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (3 * stride)))); - mask = Sse2.Max(mask, Abs(tmp1, tmp2)); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, tmp1)); + mask = 
Vector128.Max(mask, AbsVector128(tmp1, tmp2)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, tmp1)); // p3 and p2 are not just temporary variables here: they will be // re-used for next span. And q2/q3 will become p1/p0 accordingly. - ComplexMask(p1, p0, p3, p2, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); + ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); + DoFilter4Vector128(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); // Store. ref byte outputRef = ref MemoryMarshal.GetReference(b); @@ -1641,10 +1535,10 @@ internal static class LossyUtils public static void HFilter16i(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte pRef = ref MemoryMarshal.GetReference(p); - Load16x4(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); + Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); Vector128 mask; for (int k = 3; k > 0; k--) @@ -1656,20 +1550,20 @@ internal static class LossyUtils offset += 4; // Compute partial mask. - mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + mask = AbsVector128(p1, p0); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); - Load16x4(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out p3, out p2, out Vector128 tmp1, out Vector128 tmp2); + Load16x4Vector128(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out p3, out p2, out Vector128 tmp1, out Vector128 tmp2); - mask = Sse2.Max(mask, Abs(tmp1, tmp2)); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, tmp1)); + mask = Vector128.Max(mask, AbsVector128(tmp1, tmp2)); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, tmp1)); - ComplexMask(p1, p0, p3, p2, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); + ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); + DoFilter4Vector128(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); - Store16x4(p1, p0, p3, p2, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); + Store16x4Vector128(p1, p0, p3, p2, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); // Rotate samples. p1 = tmp1; @@ -1690,39 +1584,39 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter8(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load uv h-edges. 
ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - Vector128 t1 = LoadUvEdge(ref uRef, ref vRef, offset - (4 * stride)); - Vector128 p2 = LoadUvEdge(ref uRef, ref vRef, offset - (3 * stride)); - Vector128 p1 = LoadUvEdge(ref uRef, ref vRef, offset - (2 * stride)); - Vector128 p0 = LoadUvEdge(ref uRef, ref vRef, offset - stride); + Vector128 t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - (4 * stride)); + Vector128 p2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - (3 * stride)); + Vector128 p1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - (2 * stride)); + Vector128 p0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - stride); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t1, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Vector128.Max(mask, AbsVector128(t1, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); - Vector128 q0 = LoadUvEdge(ref uRef, ref vRef, offset); - Vector128 q1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); - Vector128 q2 = LoadUvEdge(ref uRef, ref vRef, offset + (2 * stride)); - t1 = LoadUvEdge(ref uRef, ref vRef, offset + (3 * stride)); + Vector128 q0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); + Vector128 q1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); + Vector128 q2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (2 * stride)); + t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (3 * stride)); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t1, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t1, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); // Store. 
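// Sketch (layout assumed from the loads above; helper name hypothetical) of
// what LoadUvEdgeVector128 assembles: eight U bytes in the low half and eight
// V bytes in the high half, so a single 128-bit filter pass covers both
// chroma planes at once. Native endianness is assumed, as in the vector loads.
private static Vector128<byte> LoadUvEdgeSketch(ReadOnlySpan<byte> u, ReadOnlySpan<byte> v, int offset)
{
    ulong lo = MemoryMarshal.Read<ulong>(u.Slice(offset, 8));
    ulong hi = MemoryMarshal.Read<ulong>(v.Slice(offset, 8));
    return Vector128.Create(lo, hi).AsByte();
}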
- StoreUv(p2, ref uRef, ref vRef, offset - (3 * stride)); - StoreUv(p1, ref uRef, ref vRef, offset - (2 * stride)); - StoreUv(p0, ref uRef, ref vRef, offset - stride); - StoreUv(q0, ref uRef, ref vRef, offset); - StoreUv(q1, ref uRef, ref vRef, offset + (1 * stride)); - StoreUv(q2, ref uRef, ref vRef, offset + (2 * stride)); + StoreUvVector128(p2, ref uRef, ref vRef, offset - (3 * stride)); + StoreUvVector128(p1, ref uRef, ref vRef, offset - (2 * stride)); + StoreUvVector128(p0, ref uRef, ref vRef, offset - stride); + StoreUvVector128(q0, ref uRef, ref vRef, offset); + StoreUvVector128(q1, ref uRef, ref vRef, offset + (1 * stride)); + StoreUvVector128(q2, ref uRef, ref vRef, offset + (2 * stride)); } else { @@ -1734,27 +1628,27 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter8(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - Load16x4(ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); + Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Vector128.Max(mask, AbsVector128(p3, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); - Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); + Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(q3, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(q3, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter6Vector128(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); - Store16x4(p3, p2, p1, p0, ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride); - Store16x4(q0, q1, q2, q3, ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride); + Store16x4Vector128(p3, p2, p1, p0, ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride); + Store16x4Vector128(q0, q1, q2, q3, ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride); } else { @@ -1766,39 +1660,39 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter8i(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load uv h-edges. 
ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - Vector128 t2 = LoadUvEdge(ref uRef, ref vRef, offset); - Vector128 t1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); - Vector128 p1 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 2)); - Vector128 p0 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 3)); + Vector128 t2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); + Vector128 t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); + Vector128 p1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 2)); + Vector128 p0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 3)); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, p1)); offset += 4 * stride; - Vector128 q0 = LoadUvEdge(ref uRef, ref vRef, offset); - Vector128 q1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); - t1 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 2)); - t2 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 3)); + Vector128 q0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); + Vector128 q1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); + t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 2)); + t2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 3)); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter4Vector128(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); // Store. - StoreUv(p1, ref uRef, ref vRef, offset + (-2 * stride)); - StoreUv(p0, ref uRef, ref vRef, offset + (-1 * stride)); - StoreUv(q0, ref uRef, ref vRef, offset); - StoreUv(q1, ref uRef, ref vRef, offset + stride); + StoreUvVector128(p1, ref uRef, ref vRef, offset + (-2 * stride)); + StoreUvVector128(p0, ref uRef, ref vRef, offset + (-1 * stride)); + StoreUvVector128(q0, ref uRef, ref vRef, offset); + StoreUvVector128(q1, ref uRef, ref vRef, offset + stride); } else { @@ -1811,31 +1705,31 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter8i(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 t2, out Vector128 t1, out Vector128 p1, out Vector128 p0); + Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 t2, out Vector128 t1, out Vector128 p1, out Vector128 p0); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, p1)); // Beginning of q0. 
offset += 4; - Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out t1, out t2); + Load16x4Vector128(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out t1, out t2); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t2, t1)); + mask = Vector128.Max(mask, AbsVector128(t1, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); - DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter4Vector128(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); // Beginning of p1. offset -= 2; - Store16x4(p1, p0, q0, q1, ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride); + Store16x4Vector128(p1, p0, q0, q1, ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride); } else { @@ -1847,7 +1741,7 @@ internal static class LossyUtils public static void Mean16x4(Span input, Span dc) { - if (Ssse3.IsSupported) + if (Vector128.IsHardwareAccelerated) { Vector128 mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); @@ -1855,23 +1749,23 @@ internal static class LossyUtils Vector128 a1 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16))); Vector128 a2 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16))); Vector128 a3 = Unsafe.As>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 3, 16))); - Vector128 b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte - Vector128 b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); - Vector128 b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); - Vector128 b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); - Vector128 c0 = Sse2.And(a0, mean16x4Mask); // lo byte - Vector128 c1 = Sse2.And(a1, mean16x4Mask); - Vector128 c2 = Sse2.And(a2, mean16x4Mask); - Vector128 c3 = Sse2.And(a3, mean16x4Mask); - Vector128 d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); - Vector128 d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); - Vector128 d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); - Vector128 d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); - Vector128 e0 = Sse2.Add(d0, d1); - Vector128 e1 = Sse2.Add(d2, d3); - Vector128 f0 = Sse2.Add(e0, e1); - Vector128 hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); - Vector128 wide = Sse2.UnpackLow(hadd, Vector128.Zero).AsUInt32(); + Vector128 b0 = Vector128.ShiftRightLogical(a0.AsInt16(), 8); // hi byte + Vector128 b1 = Vector128.ShiftRightLogical(a1.AsInt16(), 8); + Vector128 b2 = Vector128.ShiftRightLogical(a2.AsInt16(), 8); + Vector128 b3 = Vector128.ShiftRightLogical(a3.AsInt16(), 8); + Vector128 c0 = a0 & mean16x4Mask; // lo byte + Vector128 c1 = a1 & mean16x4Mask; + Vector128 c2 = a2 & mean16x4Mask; + Vector128 c3 = a3 & mean16x4Mask; + Vector128 d0 = b0.AsInt32() + c0.AsInt32(); + Vector128 d1 = b1.AsInt32() + c1.AsInt32(); + Vector128 d2 = b2.AsInt32() + c2.AsInt32(); + Vector128 d3 = b3.AsInt32() + c3.AsInt32(); + Vector128 e0 = d0 + d1; + Vector128 e1 = d2 + d3; + Vector128 f0 = e0 + e1; + Vector128 hadd = Vector128_.HorizontalAdd(f0.AsInt16(), f0.AsInt16()); + Vector128 wide = Vector128_.UnpackLow(hadd, Vector128.Zero).AsUInt32(); ref uint outputRef = ref MemoryMarshal.GetReference(dc); Unsafe.As>(ref outputRef) = wide; @@ 
-1910,6 +1804,43 @@ internal static class LossyUtils // Cost of coding one event with probability 'proba'. public static int Vp8BitCost(int bit, byte proba) => bit == 0 ? WebpLookupTables.Vp8EntropyCost[proba] : WebpLookupTables.Vp8EntropyCost[255 - proba]; + /// + /// Reduces elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of all elements. + [MethodImpl(InliningOptions.ShortMethod)] + public static int ReduceSumVector256(Vector256 accumulator) + { + // Add upper lane to lower lane. + Vector128 vsum = accumulator.GetLower() + accumulator.GetUpper(); + + // Add odd to even. + vsum += Vector128_.ShuffleNative(vsum, 0b_11_11_01_01); + + // Add high to low. + vsum += Vector128_.ShuffleNative(vsum, 0b_11_10_11_10); + + return vsum.ToScalar(); + } + + /// + /// Reduces elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of all elements. + [MethodImpl(InliningOptions.ShortMethod)] + private static int ReduceSumVector128(Vector128 accumulator) + { + // Add odd to even. + Vector128 vsum = accumulator + Vector128_.ShuffleNative(accumulator, 0b_11_11_01_01); + + // Add high to low. + vsum += Vector128_.ShuffleNative(vsum, 0b_11_10_11_10); + + return vsum.ToScalar(); + } + [MethodImpl(InliningOptions.ShortMethod)] private static void Put16(int v, Span dst) { @@ -2015,144 +1946,144 @@ internal static class LossyUtils } // Applies filter on 2 pixels (p0 and q0) - private static void DoFilter2Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int thresh) + private static void DoFilter2Vector128(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int thresh) { - var signBit = Vector128.Create((byte)0x80); + Vector128 signBit = Vector128.Create((byte)0x80); // Convert p1/q1 to byte (for GetBaseDelta). - Vector128 p1s = Sse2.Xor(p1, signBit); - Vector128 q1s = Sse2.Xor(q1, signBit); - Vector128 mask = NeedsFilter(p1, p0, q0, q1, thresh); + Vector128 p1s = p1 ^ signBit; + Vector128 q1s = q1 ^ signBit; + Vector128 mask = NeedsFilterVector128(p1, p0, q0, q1, thresh); // Flip sign. - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); + p0 ^= signBit; + q0 ^= signBit; - Vector128 a = GetBaseDelta(p1s.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1s.AsSByte()).AsByte(); + Vector128 a = GetBaseDeltaVector128(p1s.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1s.AsSByte()).AsByte(); // Mask filter values we don't care about. - a = Sse2.And(a, mask); + a &= mask; - DoSimpleFilterSse2(ref p0, ref q0, a); + DoSimpleFilterVector128(ref p0, ref q0, a); // Flip sign. - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); + p0 ^= signBit; + q0 ^= signBit; } // Applies filter on 4 pixels (p1, p0, q0 and q1) - private static void DoFilter4Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, Vector128 mask, int tresh) + private static void DoFilter4Vector128(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, Vector128 mask, int tresh) { // Compute hev mask. - Vector128 notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh); + Vector128 notHev = GetNotHevVector128(ref p1, ref p0, ref q0, ref q1, tresh); - var signBit = Vector128.Create((byte)0x80); + Vector128 signBit = Vector128.Create((byte)0x80); // Convert to signed values. 
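// A rough scalar model (an approximation, not the shipped fallback) of the
// simple filter applied by DoFilter2Vector128 above: after the sign-bit toggle
// moves the bytes into signed range, the base delta
// a = 3 * (q0 - p0) + clamp(p1 - q1) is computed, and (a + 3) >> 3 and
// (a + 4) >> 3 adjust p0 and q0 respectively. DoFilter4Vector128 applies the
// same core update but gates the (p1 - q1) term and the extra p1/q1 updates on
// the "high edge variance" mask.
private static (byte P0, byte Q0) SimpleFilterSketch(byte p1, byte p0, byte q0, byte q1)
{
    int a = (3 * (q0 - p0)) + Math.Clamp(p1 - q1, -128, 127);
    int f3 = Math.Clamp((a + 3) >> 3, -16, 15);
    int f4 = Math.Clamp((a + 4) >> 3, -16, 15);
    return ((byte)Math.Clamp(p0 + f3, 0, 255), (byte)Math.Clamp(q0 - f4, 0, 255));
}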
- p1 = Sse2.Xor(p1, signBit); - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); - q1 = Sse2.Xor(q1, signBit); - - Vector128 t1 = Sse2.SubtractSaturate(p1.AsSByte(), q1.AsSByte()); // p1 - q1 - t1 = Sse2.AndNot(notHev, t1.AsByte()).AsSByte(); // hev(p1 - q1) - Vector128 t2 = Sse2.SubtractSaturate(q0.AsSByte(), p0.AsSByte()); // q0 - p0 - t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0) - t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0) - t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0) - t1 = Sse2.And(t1.AsByte(), mask).AsSByte(); // mask filter values we don't care about. - - t2 = Sse2.AddSaturate(t1, Vector128.Create((byte)3).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 3 - Vector128 t3 = Sse2.AddSaturate(t1, Vector128.Create((byte)4).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 4 - t2 = SignedShift8b(t2.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 - t3 = SignedShift8b(t3.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 - p0 = Sse2.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2 - q0 = Sse2.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3 - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); + p1 ^= signBit; + p0 ^= signBit; + q0 ^= signBit; + q1 ^= signBit; + + Vector128 t1 = Vector128_.SubtractSaturate(p1.AsSByte(), q1.AsSByte()); // p1 - q1 + t1 = (~notHev & t1.AsByte()).AsSByte(); // hev(p1 - q1) + Vector128 t2 = Vector128_.SubtractSaturate(q0.AsSByte(), p0.AsSByte()); // q0 - p0 + t1 = Vector128_.AddSaturate(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0) + t1 = Vector128_.AddSaturate(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0) + t1 = Vector128_.AddSaturate(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0) + t1 = (t1.AsByte() & mask).AsSByte(); // mask filter values we don't care about. + + t2 = Vector128_.AddSaturate(t1, Vector128.Create((byte)3).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 3 + Vector128 t3 = Vector128_.AddSaturate(t1, Vector128.Create((byte)4).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 4 + t2 = SignedShift8bVector128(t2.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 + t3 = SignedShift8bVector128(t3.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 + p0 = Vector128_.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2 + q0 = Vector128_.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3 + p0 ^= signBit; + q0 ^= signBit; // This is equivalent to signed (a + 1) >> 1 calculation. 
- t2 = Sse2.Add(t3, signBit.AsSByte()); - t3 = Sse2.Average(t2.AsByte(), Vector128.Zero).AsSByte(); - t3 = Sse2.Subtract(t3, Vector128.Create((sbyte)64)); + t2 = t3 + signBit.AsSByte(); + t3 = Vector128_.Average(t2.AsByte(), Vector128.Zero).AsSByte(); + t3 -= Vector128.Create((sbyte)64); - t3 = Sse2.And(notHev, t3.AsByte()).AsSByte(); // if !hev - q1 = Sse2.SubtractSaturate(q1.AsSByte(), t3).AsByte(); // q1 -= t3 - p1 = Sse2.AddSaturate(p1.AsSByte(), t3).AsByte(); // p1 += t3 - p1 = Sse2.Xor(p1.AsByte(), signBit); - q1 = Sse2.Xor(q1.AsByte(), signBit); + t3 = (notHev & t3.AsByte()).AsSByte(); // if !hev + q1 = Vector128_.SubtractSaturate(q1.AsSByte(), t3).AsByte(); // q1 -= t3 + p1 = Vector128_.AddSaturate(p1.AsSByte(), t3).AsByte(); // p1 += t3 + p1 = p1.AsByte() ^ signBit; + q1 = q1.AsByte() ^ signBit; } // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) - private static void DoFilter6Sse2(ref Vector128 p2, ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, ref Vector128 q2, Vector128 mask, int tresh) + private static void DoFilter6Vector128(ref Vector128 p2, ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, ref Vector128 q2, Vector128 mask, int tresh) { // Compute hev mask. - Vector128 notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh); + Vector128 notHev = GetNotHevVector128(ref p1, ref p0, ref q0, ref q1, tresh); // Convert to signed values. - var signBit = Vector128.Create((byte)0x80); - p1 = Sse2.Xor(p1, signBit); - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); - q1 = Sse2.Xor(q1, signBit); - p2 = Sse2.Xor(p2, signBit); - q2 = Sse2.Xor(q2, signBit); + Vector128 signBit = Vector128.Create((byte)0x80); + p1 ^= signBit; + p0 ^= signBit; + q0 ^= signBit; + q1 ^= signBit; + p2 ^= signBit; + q2 ^= signBit; - Vector128 a = GetBaseDelta(p1.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1.AsSByte()); + Vector128 a = GetBaseDeltaVector128(p1.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1.AsSByte()); // Do simple filter on pixels with hev. - Vector128 m = Sse2.AndNot(notHev, mask); - Vector128 f = Sse2.And(a.AsByte(), m); - DoSimpleFilterSse2(ref p0, ref q0, f); + Vector128 m = ~notHev & mask; + Vector128 f = a.AsByte() & m; + DoSimpleFilterVector128(ref p0, ref q0, f); // Do strong filter on pixels with not hev. 
- m = Sse2.And(notHev, mask); - f = Sse2.And(a.AsByte(), m); - Vector128 flow = Sse2.UnpackLow(Vector128.Zero, f); - Vector128 fhigh = Sse2.UnpackHigh(Vector128.Zero, f); + m = notHev & mask; + f = a.AsByte() & m; + Vector128 flow = Vector128_.UnpackLow(Vector128.Zero, f); + Vector128 fhigh = Vector128_.UnpackHigh(Vector128.Zero, f); - var nine = Vector128.Create((short)0x0900); - Vector128 f9Low = Sse2.MultiplyHigh(flow.AsInt16(), nine); // Filter (lo) * 9 - Vector128 f9High = Sse2.MultiplyHigh(fhigh.AsInt16(), nine); // Filter (hi) * 9 + Vector128 nine = Vector128.Create((short)0x0900); + Vector128 f9Low = Vector128_.MultiplyHigh(flow.AsInt16(), nine); // Filter (lo) * 9 + Vector128 f9High = Vector128_.MultiplyHigh(fhigh.AsInt16(), nine); // Filter (hi) * 9 - var sixtyThree = Vector128.Create((short)63); - Vector128 a2Low = Sse2.Add(f9Low, sixtyThree); // Filter * 9 + 63 - Vector128 a2High = Sse2.Add(f9High, sixtyThree); // Filter * 9 + 63 + Vector128 sixtyThree = Vector128.Create((short)63); + Vector128 a2Low = f9Low + sixtyThree; // Filter * 9 + 63 + Vector128 a2High = f9High + sixtyThree; // Filter * 9 + 63 - Vector128 a1Low = Sse2.Add(a2Low, f9Low); // Filter * 18 + 63 - Vector128 a1High = Sse2.Add(a2High, f9High); // // Filter * 18 + 63 + Vector128 a1Low = a2Low + f9Low; // Filter * 18 + 63 + Vector128 a1High = a2High + f9High; // // Filter * 18 + 63 - Vector128 a0Low = Sse2.Add(a1Low, f9Low); // Filter * 27 + 63 - Vector128 a0High = Sse2.Add(a1High, f9High); // Filter * 27 + 63 + Vector128 a0Low = a1Low + f9Low; // Filter * 27 + 63 + Vector128 a0High = a1High + f9High; // Filter * 27 + 63 - Update2Pixels(ref p2, ref q2, a2Low, a2High); - Update2Pixels(ref p1, ref q1, a1Low, a1High); - Update2Pixels(ref p0, ref q0, a0Low, a0High); + Update2PixelsVector128(ref p2, ref q2, a2Low, a2High); + Update2PixelsVector128(ref p1, ref q1, a1Low, a1High); + Update2PixelsVector128(ref p0, ref q0, a0Low, a0High); } - private static void DoSimpleFilterSse2(ref Vector128 p0, ref Vector128 q0, Vector128 fl) + private static void DoSimpleFilterVector128(ref Vector128 p0, ref Vector128 q0, Vector128 fl) { - Vector128 v3 = Sse2.AddSaturate(fl.AsSByte(), Vector128.Create((byte)3).AsSByte()); - Vector128 v4 = Sse2.AddSaturate(fl.AsSByte(), Vector128.Create((byte)4).AsSByte()); + Vector128 v3 = Vector128_.AddSaturate(fl.AsSByte(), Vector128.Create((byte)3).AsSByte()); + Vector128 v4 = Vector128_.AddSaturate(fl.AsSByte(), Vector128.Create((byte)4).AsSByte()); - v4 = SignedShift8b(v4.AsByte()).AsSByte(); // v4 >> 3 - v3 = SignedShift8b(v3.AsByte()).AsSByte(); // v3 >> 3 - q0 = Sse2.SubtractSaturate(q0.AsSByte(), v4).AsByte(); // q0 -= v4 - p0 = Sse2.AddSaturate(p0.AsSByte(), v3).AsByte(); // p0 += v3 + v4 = SignedShift8bVector128(v4.AsByte()).AsSByte(); // v4 >> 3 + v3 = SignedShift8bVector128(v3.AsByte()).AsSByte(); // v3 >> 3 + q0 = Vector128_.SubtractSaturate(q0.AsSByte(), v4).AsByte(); // q0 -= v4 + p0 = Vector128_.AddSaturate(p0.AsSByte(), v3).AsByte(); // p0 += v3 } - private static Vector128 GetNotHev(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int hevThresh) + private static Vector128 GetNotHevVector128(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int hevThresh) { - Vector128 t1 = Abs(p1, p0); - Vector128 t2 = Abs(q1, q0); + Vector128 t1 = AbsVector128(p1, p0); + Vector128 t2 = AbsVector128(q1, q0); - var h = Vector128.Create((byte)hevThresh); - Vector128 tMax = Sse2.Max(t1, t2); + Vector128 h = Vector128.Create((byte)hevThresh); + Vector128 tMax 
= Vector128.Max(t1, t2); - Vector128 tMaxH = Sse2.SubtractSaturate(tMax, h); + Vector128 tMaxH = Vector128_.SubtractSaturate(tMax, h); // not_hev <= t1 && not_hev <= t2 - return Sse2.CompareEqual(tMaxH, Vector128.Zero); + return Vector128.Equals(tMaxH, Vector128.Zero); } // Applies filter on 4 pixels (p1, p0, q0 and q1) @@ -2233,24 +2164,24 @@ internal static class LossyUtils WebpLookupTables.Abs0(q2 - q1) <= it && WebpLookupTables.Abs0(q1 - q0) <= it; } - private static Vector128 NeedsFilter(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh) + private static Vector128 NeedsFilterVector128(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh) { - var mthresh = Vector128.Create((byte)thresh); - Vector128 t1 = Abs(p1, q1); // abs(p1 - q1) - var fe = Vector128.Create((byte)0xFE); - Vector128 t2 = Sse2.And(t1, fe); // set lsb of each byte to zero. - Vector128 t3 = Sse2.ShiftRightLogical(t2.AsInt16(), 1); // abs(p1 - q1) / 2 + Vector128 mthresh = Vector128.Create((byte)thresh); + Vector128 t1 = AbsVector128(p1, q1); // abs(p1 - q1) + Vector128 fe = Vector128.Create((byte)0xFE); + Vector128 t2 = t1 & fe; // set lsb of each byte to zero. + Vector128 t3 = Vector128.ShiftRightLogical(t2.AsInt16(), 1); // abs(p1 - q1) / 2 - Vector128 t4 = Abs(p0, q0); // abs(p0 - q0) - Vector128 t5 = Sse2.AddSaturate(t4, t4); // abs(p0 - q0) * 2 - Vector128 t6 = Sse2.AddSaturate(t5.AsByte(), t3.AsByte()); // abs(p0-q0)*2 + abs(p1-q1)/2 + Vector128 t4 = AbsVector128(p0, q0); // abs(p0 - q0) + Vector128 t5 = Vector128_.AddSaturate(t4, t4); // abs(p0 - q0) * 2 + Vector128 t6 = Vector128_.AddSaturate(t5.AsByte(), t3.AsByte()); // abs(p0-q0)*2 + abs(p1-q1)/2 - Vector128 t7 = Sse2.SubtractSaturate(t6, mthresh.AsByte()); // mask <= m_thresh + Vector128 t7 = Vector128_.SubtractSaturate(t6, mthresh.AsByte()); // mask <= m_thresh - return Sse2.CompareEqual(t7, Vector128.Zero); + return Vector128.Equals(t7, Vector128.Zero); } - private static void Load16x4(ref byte r0, ref byte r8, int stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1) + private static void Load16x4Vector128(ref byte r0, ref byte r8, int stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1) { // Assume the pixels around the edge (|) are numbered as follows // 00 01 | 02 03 @@ -2267,21 +2198,21 @@ internal static class LossyUtils // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - Load8x4(ref r0, (uint)stride, out Vector128 t1, out Vector128 t2); - Load8x4(ref r8, (uint)stride, out p0, out q1); + Load8x4Vector128(ref r0, (uint)stride, out Vector128 t1, out Vector128 t2); + Load8x4Vector128(ref r8, (uint)stride, out p0, out q1); // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - p1 = Sse2.UnpackLow(t1.AsInt64(), p0.AsInt64()).AsByte(); - p0 = Sse2.UnpackHigh(t1.AsInt64(), p0.AsInt64()).AsByte(); - q0 = Sse2.UnpackLow(t2.AsInt64(), q1.AsInt64()).AsByte(); - q1 = Sse2.UnpackHigh(t2.AsInt64(), q1.AsInt64()).AsByte(); + p1 = Vector128_.UnpackLow(t1.AsInt64(), p0.AsInt64()).AsByte(); + p0 = Vector128_.UnpackHigh(t1.AsInt64(), p0.AsInt64()).AsByte(); + q0 = Vector128_.UnpackLow(t2.AsInt64(), q1.AsInt64()).AsByte(); + q1 = Vector128_.UnpackHigh(t2.AsInt64(), q1.AsInt64()).AsByte(); } // Reads 8 
rows across a vertical edge. - private static void Load8x4(ref byte bRef, nuint stride, out Vector128 p, out Vector128 q) + private static void Load8x4Vector128(ref byte bRef, nuint stride, out Vector128 p, out Vector128 q) { // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 @@ -2298,125 +2229,123 @@ internal static class LossyUtils // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 - Vector128 b0 = Sse2.UnpackLow(a0.AsSByte(), a1.AsSByte()); - Vector128 b1 = Sse2.UnpackHigh(a0.AsSByte(), a1.AsSByte()); + Vector128 b0 = Vector128_.UnpackLow(a0.AsSByte(), a1.AsSByte()); + Vector128 b1 = Vector128_.UnpackHigh(a0.AsSByte(), a1.AsSByte()); // C0 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 // C1 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - Vector128 c0 = Sse2.UnpackLow(b0.AsInt16(), b1.AsInt16()); - Vector128 c1 = Sse2.UnpackHigh(b0.AsInt16(), b1.AsInt16()); + Vector128 c0 = Vector128_.UnpackLow(b0.AsInt16(), b1.AsInt16()); + Vector128 c1 = Vector128_.UnpackHigh(b0.AsInt16(), b1.AsInt16()); // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - p = Sse2.UnpackLow(c0.AsInt32(), c1.AsInt32()).AsByte(); - q = Sse2.UnpackHigh(c0.AsInt32(), c1.AsInt32()).AsByte(); + p = Vector128_.UnpackLow(c0.AsInt32(), c1.AsInt32()).AsByte(); + q = Vector128_.UnpackHigh(c0.AsInt32(), c1.AsInt32()).AsByte(); } // Transpose back and store - private static void Store16x4(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, ref byte r0Ref, ref byte r8Ref, int stride) + private static void Store16x4Vector128(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, ref byte r0Ref, ref byte r8Ref, int stride) { // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - Vector128 p0s = Sse2.UnpackLow(p1, p0); - Vector128 p1s = Sse2.UnpackHigh(p1, p0); + Vector128 p0s = Vector128_.UnpackLow(p1, p0); + Vector128 p1s = Vector128_.UnpackHigh(p1, p0); // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - Vector128 q0s = Sse2.UnpackLow(q0, q1); - Vector128 q1s = Sse2.UnpackHigh(q0, q1); + Vector128 q0s = Vector128_.UnpackLow(q0, q1); + Vector128 q1s = Vector128_.UnpackHigh(q0, q1); // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 Vector128 t1 = p0s; - p0s = Sse2.UnpackLow(t1.AsInt16(), q0s.AsInt16()).AsByte(); - q0s = Sse2.UnpackHigh(t1.AsInt16(), q0s.AsInt16()).AsByte(); + p0s = Vector128_.UnpackLow(t1.AsInt16(), q0s.AsInt16()).AsByte(); + q0s = Vector128_.UnpackHigh(t1.AsInt16(), q0s.AsInt16()).AsByte(); // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 t1 = p1s; - p1s = Sse2.UnpackLow(t1.AsInt16(), q1s.AsInt16()).AsByte(); - q1s = Sse2.UnpackHigh(t1.AsInt16(), q1s.AsInt16()).AsByte(); + p1s = Vector128_.UnpackLow(t1.AsInt16(), q1s.AsInt16()).AsByte(); + q1s = Vector128_.UnpackHigh(t1.AsInt16(), q1s.AsInt16()).AsByte(); - Store4x4(p0s, ref r0Ref, stride); - Store4x4(q0s, ref Unsafe.Add(ref r0Ref, 4 * (uint)stride), stride); + Store4x4Vector128(p0s, ref r0Ref, stride); + Store4x4Vector128(q0s, ref Unsafe.Add(ref r0Ref, 4 * (uint)stride), stride); - Store4x4(p1s, ref r8Ref, stride); - Store4x4(q1s, ref Unsafe.Add(ref r8Ref, 4 * (uint)stride), stride); + Store4x4Vector128(p1s, ref r8Ref, stride); + 
Store4x4Vector128(q1s, ref Unsafe.Add(ref r8Ref, 4 * (uint)stride), stride); } - private static void Store4x4(Vector128 x, ref byte dstRef, int stride) + private static void Store4x4Vector128(Vector128 x, ref byte dstRef, int stride) { int offset = 0; for (int i = 0; i < 4; i++) { - Unsafe.As(ref Unsafe.Add(ref dstRef, (uint)offset)) = Sse2.ConvertToInt32(x.AsInt32()); - x = Sse2.ShiftRightLogical128BitLane(x, 4); + Unsafe.As(ref Unsafe.Add(ref dstRef, (uint)offset)) = x.AsInt32().ToScalar(); + x = Vector128_.ShiftRightBytesInVector(x, 4); offset += stride; } } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 GetBaseDelta(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1) + private static Vector128 GetBaseDeltaVector128(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1) { // Beware of addition order, for saturation! - Vector128 p1q1 = Sse2.SubtractSaturate(p1, q1); // p1 - q1 - Vector128 q0p0 = Sse2.SubtractSaturate(q0, p0); // q0 - p0 - Vector128 s1 = Sse2.AddSaturate(p1q1, q0p0); // p1 - q1 + 1 * (q0 - p0) - Vector128 s2 = Sse2.AddSaturate(q0p0, s1); // p1 - q1 + 2 * (q0 - p0) - Vector128 s3 = Sse2.AddSaturate(q0p0, s2); // p1 - q1 + 3 * (q0 - p0) - - return s3; + Vector128 p1q1 = Vector128_.SubtractSaturate(p1, q1); // p1 - q1 + Vector128 q0p0 = Vector128_.SubtractSaturate(q0, p0); // q0 - p0 + Vector128 s1 = Vector128_.AddSaturate(p1q1, q0p0); // p1 - q1 + 1 * (q0 - p0) + Vector128 s2 = Vector128_.AddSaturate(q0p0, s1); // p1 - q1 + 2 * (q0 - p0) + return Vector128_.AddSaturate(q0p0, s2); // p1 - q1 + 3 * (q0 - p0) } // Shift each byte of "x" by 3 bits while preserving by the sign bit. [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 SignedShift8b(Vector128 x) + private static Vector128 SignedShift8bVector128(Vector128 x) { - Vector128 low0 = Sse2.UnpackLow(Vector128.Zero, x); - Vector128 high0 = Sse2.UnpackHigh(Vector128.Zero, x); - Vector128 low1 = Sse2.ShiftRightArithmetic(low0.AsInt16(), 3 + 8); - Vector128 high1 = Sse2.ShiftRightArithmetic(high0.AsInt16(), 3 + 8); + Vector128 low0 = Vector128_.UnpackLow(Vector128.Zero, x); + Vector128 high0 = Vector128_.UnpackHigh(Vector128.Zero, x); + Vector128 low1 = Vector128.ShiftRightArithmetic(low0.AsInt16(), 3 + 8); + Vector128 high1 = Vector128.ShiftRightArithmetic(high0.AsInt16(), 3 + 8); - return Sse2.PackSignedSaturate(low1, high1); + return Vector128_.PackSignedSaturate(low1, high1); } [MethodImpl(InliningOptions.ShortMethod)] - private static void ComplexMask(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh, int ithresh, ref Vector128 mask) + private static void ComplexMaskVector128(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh, int ithresh, ref Vector128 mask) { - var it = Vector128.Create((byte)ithresh); - Vector128 diff = Sse2.SubtractSaturate(mask, it); - Vector128 threshMask = Sse2.CompareEqual(diff, Vector128.Zero); - Vector128 filterMask = NeedsFilter(p1, p0, q0, q1, thresh); + Vector128 it = Vector128.Create((byte)ithresh); + Vector128 diff = Vector128_.SubtractSaturate(mask, it); + Vector128 threshMask = Vector128.Equals(diff, Vector128.Zero); + Vector128 filterMask = NeedsFilterVector128(p1, p0, q0, q1, thresh); - mask = Sse2.And(threshMask, filterMask); + mask = threshMask & filterMask; } // Updates values of 2 pixels at MB edge during complex filtering. // Update operations: // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)] // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip). 
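        // (For reference, a scalar sketch of this update for a single pixel; 'a' is the 16-bit
        // filter value and 'p'/'q' are the pixel values in the signed domain. The vector code below
        // does the same thing lane-wise: arithmetic shift, signed-saturating pack/add/subtract,
        // then an XOR with 0x80 to move back to the unsigned byte domain. Names are illustrative.)
        private static (byte P, byte Q) Update2PixelsScalar(sbyte p, sbyte q, short a)
        {
            int delta = Math.Clamp(a >> 7, sbyte.MinValue, sbyte.MaxValue);                     // mirrors the signed-saturating pack
            byte pOut = (byte)(Math.Clamp(p + delta, sbyte.MinValue, sbyte.MaxValue) ^ 0x80);   // p += delta, saturated, sign flip
            byte qOut = (byte)(Math.Clamp(q - delta, sbyte.MinValue, sbyte.MaxValue) ^ 0x80);   // q -= delta, saturated, sign flip
            return (pOut, qOut);
        }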
- private static void Update2Pixels(ref Vector128 pi, ref Vector128 qi, Vector128 a0Low, Vector128 a0High) + private static void Update2PixelsVector128(ref Vector128 pi, ref Vector128 qi, Vector128 a0Low, Vector128 a0High) { - var signBit = Vector128.Create((byte)0x80); - Vector128 a1Low = Sse2.ShiftRightArithmetic(a0Low, 7); - Vector128 a1High = Sse2.ShiftRightArithmetic(a0High, 7); - Vector128 delta = Sse2.PackSignedSaturate(a1Low, a1High); - pi = Sse2.AddSaturate(pi.AsSByte(), delta).AsByte(); - qi = Sse2.SubtractSaturate(qi.AsSByte(), delta).AsByte(); - pi = Sse2.Xor(pi, signBit.AsByte()); - qi = Sse2.Xor(qi, signBit.AsByte()); + Vector128 signBit = Vector128.Create((byte)0x80); + Vector128 a1Low = Vector128.ShiftRightArithmetic(a0Low, 7); + Vector128 a1High = Vector128.ShiftRightArithmetic(a0High, 7); + Vector128 delta = Vector128_.PackSignedSaturate(a1Low, a1High); + pi = Vector128_.AddSaturate(pi.AsSByte(), delta).AsByte(); + qi = Vector128_.SubtractSaturate(qi.AsSByte(), delta).AsByte(); + pi ^= signBit.AsByte(); + qi ^= signBit.AsByte(); } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 LoadUvEdge(ref byte uRef, ref byte vRef, int offset) + private static Vector128 LoadUvEdgeVector128(ref byte uRef, ref byte vRef, int offset) { - var uVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref uRef, (uint)offset)), 0); - var vVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref vRef, (uint)offset)), 0); - return Sse2.UnpackLow(uVec, vVec).AsByte(); + Vector128 uVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref uRef, (uint)offset)), 0); + Vector128 vVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref vRef, (uint)offset)), 0); + return Vector128_.UnpackLow(uVec, vVec).AsByte(); } [MethodImpl(InliningOptions.ShortMethod)] - private static void StoreUv(Vector128 x, ref byte uRef, ref byte vRef, int offset) + private static void StoreUvVector128(Vector128 x, ref byte uRef, ref byte vRef, int offset) { Unsafe.As>(ref Unsafe.Add(ref uRef, (uint)offset)) = x.GetLower(); Unsafe.As>(ref Unsafe.Add(ref vRef, (uint)offset)) = x.GetUpper(); @@ -2424,8 +2353,8 @@ internal static class LossyUtils // Compute abs(p - q) = subs(p - q) OR subs(q - p) [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 Abs(Vector128 p, Vector128 q) - => Sse2.Or(Sse2.SubtractSaturate(q, p), Sse2.SubtractSaturate(p, q)); + private static Vector128 AbsVector128(Vector128 p, Vector128 q) + => Vector128_.SubtractSaturate(q, p) | Vector128_.SubtractSaturate(p, q); [MethodImpl(InliningOptions.ShortMethod)] private static bool Hev(Span p, int offset, int step, int thresh) @@ -2474,5 +2403,5 @@ internal static class LossyUtils private static void Memset(Span dst, byte value, int startIdx, int count) => dst.Slice(startIdx, count).Fill(value); [MethodImpl(InliningOptions.ShortMethod)] - private static int Clamp255(int x) => x < 0 ? 0 : x > 255 ? 255 : x; + private static int Clamp255(int x) => Numerics.Clamp(x, 0, 255); } diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index c645816d4..7fe71588c 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -2,10 +2,11 @@ // Licensed under the Six Labors Split License. 
using System.Buffers.Binary; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; namespace SixLabors.ImageSharp.Formats.Webp.Lossy; @@ -78,7 +79,7 @@ internal static unsafe class Vp8Encoding // Does two inverse transforms. public static void ITransformTwo(Span reference, Span input, Span dst, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: @@ -116,10 +117,10 @@ internal static unsafe class Vp8Encoding Vector128 inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); Vector128 inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); - in0 = Sse2.UnpackLow(in0, inb0); - in1 = Sse2.UnpackLow(in1, inb1); - in2 = Sse2.UnpackLow(in2, inb2); - in3 = Sse2.UnpackLow(in3, inb3); + in0 = Vector128_.UnpackLow(in0, inb0); + in1 = Vector128_.UnpackLow(in1, inb1); + in2 = Vector128_.UnpackLow(in2, inb2); + in3 = Vector128_.UnpackLow(in3, inb3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -128,49 +129,45 @@ internal static unsafe class Vp8Encoding // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); + InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); + InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'ref' and store. // Load the reference(s). - Vector128 ref0 = Vector128.Zero; - Vector128 ref1 = Vector128.Zero; - Vector128 ref2 = Vector128.Zero; - Vector128 ref3 = Vector128.Zero; ref byte referenceRef = ref MemoryMarshal.GetReference(reference); // Load eight bytes/pixels per line. 
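            // (For reference, a scalar sketch of the column transform that the vertical/horizontal
            // passes above vectorize, using the same fixed-point constants. K1 ~= 1.30656 is stored
            // as 20091 (85627 - 65536) and K2 ~= 0.54120 as -30068 (35468 - 65536), which is why the
            // vector code folds the raw in1 +/- in3 terms back in after each MultiplyHigh.
            // Names and layout here are illustrative only.)
            static int Mul1(int x) => ((x * 20091) >> 16) + x;   // ~ x * sqrt(2) * cos(pi/8)
            static int Mul2(int x) => (x * 35468) >> 16;         // ~ x * sqrt(2) * sin(pi/8)

            static void InverseColumnScalar(int row0, int row1, int row2, int row3, Span<int> outColumn)
            {
                int a = row0 + row2;
                int b = row0 - row2;
                int c = Mul2(row1) - Mul1(row3);
                int d = Mul1(row1) + Mul2(row3);
                outColumn[0] = a + d;
                outColumn[1] = b + c;
                outColumn[2] = b - c;
                outColumn[3] = a - d;
            }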
- ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); - ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); - ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); - ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); + Vector128 ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); + Vector128 ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); + Vector128 ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); + Vector128 ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); // Convert to 16b. - ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); - ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); - ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); - ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + ref0 = Vector128_.UnpackLow(ref0, Vector128.Zero); + ref1 = Vector128_.UnpackLow(ref1, Vector128.Zero); + ref2 = Vector128_.UnpackLow(ref2, Vector128.Zero); + ref3 = Vector128_.UnpackLow(ref3, Vector128.Zero); // Add the inverse transform(s). - Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); - Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); - Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); - Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); + Vector128 ref0InvAdded = ref0.AsInt16() + t0.AsInt16(); + Vector128 ref1InvAdded = ref1.AsInt16() + t1.AsInt16(); + Vector128 ref2InvAdded = ref2.AsInt16() + t2.AsInt16(); + Vector128 ref3InvAdded = ref3.AsInt16() + t3.AsInt16(); // Unsigned saturate to 8b. - ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); - ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); - ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); - ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); + ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); // Store eight bytes/pixels per line. ref byte outputRef = ref MemoryMarshal.GetReference(dst); @@ -188,7 +185,7 @@ internal static unsafe class Vp8Encoding public static void ITransformOne(Span reference, Span input, Span dst, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load and concatenate the transform coefficients (we'll do two inverse // transforms in parallel). In the case of only one inverse transform, the @@ -207,63 +204,59 @@ internal static unsafe class Vp8Encoding // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); + InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. 
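            // (For reference: the transpose treats the four registers as two independent 4x4 blocks of
            // 16-bit values, one block per 64-bit half. A scalar sketch of a single 4x4 transpose,
            // with hypothetical row-major indexing:)
            static void Transpose4x4Scalar(Span<short> m)
            {
                for (int r = 0; r < 4; r++)
                {
                    for (int c = r + 1; c < 4; c++)
                    {
                        (m[(r * 4) + c], m[(c * 4) + r]) = (m[(c * 4) + r], m[(r * 4) + c]);
                    }
                }
            }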
- LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); + InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'ref' and store. // Load the reference(s). - Vector128 ref0 = Vector128.Zero; - Vector128 ref1 = Vector128.Zero; - Vector128 ref2 = Vector128.Zero; - Vector128 ref3 = Vector128.Zero; ref byte referenceRef = ref MemoryMarshal.GetReference(reference); // Load four bytes/pixels per line. - ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); - ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); - ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); - ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + Vector128 ref0 = Vector128.CreateScalar(Unsafe.ReadUnaligned(ref referenceRef)).AsByte(); + Vector128 ref1 = Vector128.CreateScalar(Unsafe.ReadUnaligned(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + Vector128 ref2 = Vector128.CreateScalar(Unsafe.ReadUnaligned(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + Vector128 ref3 = Vector128.CreateScalar(Unsafe.ReadUnaligned(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); // Convert to 16b. - ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); - ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); - ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); - ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + ref0 = Vector128_.UnpackLow(ref0, Vector128.Zero); + ref1 = Vector128_.UnpackLow(ref1, Vector128.Zero); + ref2 = Vector128_.UnpackLow(ref2, Vector128.Zero); + ref3 = Vector128_.UnpackLow(ref3, Vector128.Zero); // Add the inverse transform(s). - Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); - Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); - Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); - Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); + Vector128 ref0InvAdded = ref0.AsInt16() + t0.AsInt16(); + Vector128 ref1InvAdded = ref1.AsInt16() + t1.AsInt16(); + Vector128 ref2InvAdded = ref2.AsInt16() + t2.AsInt16(); + Vector128 ref3InvAdded = ref3.AsInt16() + t3.AsInt16(); // Unsigned saturate to 8b. 
- ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); - ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); - ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); - ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); + ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); // Unsigned saturate to 8b. ref byte outputRef = ref MemoryMarshal.GetReference(dst); // Store four bytes/pixels per line. - int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); - int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); - int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); - int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); - - Unsafe.As(ref outputRef) = output0; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; + int output0 = ref0.AsInt32().ToScalar(); + int output1 = ref1.AsInt32().ToScalar(); + int output2 = ref2.AsInt32().ToScalar(); + int output3 = ref3.AsInt32().ToScalar(); + + Unsafe.WriteUnaligned(ref outputRef, output0); + Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps), output1); + Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2), output2); + Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3), output3); } else { @@ -302,72 +295,72 @@ internal static unsafe class Vp8Encoding } } - private static void InverseTransformVerticalPass(Vector128 in0, Vector128 in2, Vector128 in1, Vector128 in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3) + private static void InverseTransformVerticalPassVector128(Vector128 in0, Vector128 in2, Vector128 in1, Vector128 in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3) { - Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + Vector128 a = in0.AsInt16() + in2.AsInt16(); + Vector128 b = in0.AsInt16() - in2.AsInt16(); Vector128 k1 = Vector128.Create((short)20091).AsInt16(); Vector128 k2 = Vector128.Create((short)-30068).AsInt16(); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3, c4); + Vector128 c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2); + Vector128 c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1); + Vector128 c3 = in1.AsInt16() - in3.AsInt16(); + Vector128 c4 = c1 - c2; + Vector128 c = c3 + c4; // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); + Vector128 d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1); + Vector128 d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2); + Vector128 d3 = in1.AsInt16() + in3.AsInt16(); + Vector128 d4 = d1 + d2; + Vector128 d = d3 + d4; // Second pass. 
- tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); + tmp0 = a + d; + tmp1 = b + c; + tmp2 = b - c; + tmp3 = a - d; } - private static void InverseTransformHorizontalPass(Vector128 t0, Vector128 t2, Vector128 t1, Vector128 t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3) + private static void InverseTransformHorizontalPassVector128(Vector128 t0, Vector128 t2, Vector128 t1, Vector128 t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3) { - Vector128 dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4)); - Vector128 a = Sse2.Add(dc, t2.AsInt16()); - Vector128 b = Sse2.Subtract(dc, t2.AsInt16()); + Vector128 dc = t0.AsInt16() + Vector128.Create((short)4); + Vector128 a = dc + t2.AsInt16(); + Vector128 b = dc - t2.AsInt16(); Vector128 k1 = Vector128.Create((short)20091).AsInt16(); Vector128 k2 = Vector128.Create((short)-30068).AsInt16(); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - Vector128 c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2); - Vector128 c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1); - Vector128 c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3, c4); + Vector128 c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2); + Vector128 c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1); + Vector128 c3 = t1.AsInt16() - t3.AsInt16(); + Vector128 c4 = c1 - c2; + Vector128 c = c3 + c4; // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - Vector128 d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1); - Vector128 d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2); - Vector128 d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); + Vector128 d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1); + Vector128 d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2); + Vector128 d3 = t1.AsInt16() + t3.AsInt16(); + Vector128 d4 = d1 + d2; + Vector128 d = d3 + d4; // Second pass. - Vector128 tmp0 = Sse2.Add(a, d); - Vector128 tmp1 = Sse2.Add(b, c); - Vector128 tmp2 = Sse2.Subtract(b, c); - Vector128 tmp3 = Sse2.Subtract(a, d); - shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + Vector128 tmp0 = a + d; + Vector128 tmp1 = b + c; + Vector128 tmp2 = b - c; + Vector128 tmp3 = a - d; + shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3); + shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3); + shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3); + shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); } public static void FTransform2(Span src, Span reference, Span output, Span output2, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte srcRef = ref MemoryMarshal.GetReference(src); ref byte referenceRef = ref MemoryMarshal.GetReference(reference); @@ -385,38 +378,38 @@ internal static unsafe class Vp8Encoding Vector128 ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0); // Convert both to 16 bit. 
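            // (A note on the idiom used throughout these methods: interleaving with a zero vector is a
            // widening conversion. UnpackLow(v, zero) zero-extends the low eight bytes into 16-bit lanes
            // (cf. Vector128.WidenLower), while UnpackLow(zero, v) puts each byte in the high half of its
            // lane, i.e. an implicit "<< 8". Per lane, assuming little-endian lane order, that is simply:)
            static ushort ZeroExtend(byte v) => v;                        // UnpackLow(v, zero)
            static ushort ShiftIntoHighByte(byte v) => (ushort)(v << 8);  // UnpackLow(zero, v)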
- Vector128 srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128.Zero); - Vector128 srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128.Zero); - Vector128 srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128.Zero); - Vector128 srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128.Zero); - Vector128 refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128.Zero); - Vector128 refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128.Zero); - Vector128 refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128.Zero); - Vector128 refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128.Zero); + Vector128 srcLow0 = Vector128_.UnpackLow(src0.AsByte(), Vector128.Zero); + Vector128 srcLow1 = Vector128_.UnpackLow(src1.AsByte(), Vector128.Zero); + Vector128 srcLow2 = Vector128_.UnpackLow(src2.AsByte(), Vector128.Zero); + Vector128 srcLow3 = Vector128_.UnpackLow(src3.AsByte(), Vector128.Zero); + Vector128 refLow0 = Vector128_.UnpackLow(ref0.AsByte(), Vector128.Zero); + Vector128 refLow1 = Vector128_.UnpackLow(ref1.AsByte(), Vector128.Zero); + Vector128 refLow2 = Vector128_.UnpackLow(ref2.AsByte(), Vector128.Zero); + Vector128 refLow3 = Vector128_.UnpackLow(ref3.AsByte(), Vector128.Zero); // Compute difference. -> 00 01 02 03 00' 01' 02' 03' - Vector128 diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16()); - Vector128 diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16()); - Vector128 diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16()); - Vector128 diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16()); + Vector128 diff0 = srcLow0.AsInt16() - refLow0.AsInt16(); + Vector128 diff1 = srcLow1.AsInt16() - refLow1.AsInt16(); + Vector128 diff2 = srcLow2.AsInt16() - refLow2.AsInt16(); + Vector128 diff3 = srcLow3.AsInt16() - refLow3.AsInt16(); // Unpack and shuffle. // 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 - Vector128 shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32()); - Vector128 shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32()); - Vector128 shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32()); - Vector128 shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32()); + Vector128 shuf01l = Vector128_.UnpackLow(diff0.AsInt32(), diff1.AsInt32()); + Vector128 shuf23l = Vector128_.UnpackLow(diff2.AsInt32(), diff3.AsInt32()); + Vector128 shuf01h = Vector128_.UnpackHigh(diff0.AsInt32(), diff1.AsInt32()); + Vector128 shuf23h = Vector128_.UnpackHigh(diff2.AsInt32(), diff3.AsInt32()); // First pass. - FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128 v01l, out Vector128 v32l); - FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128 v01h, out Vector128 v32h); + FTransformPass1Vector128(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128 v01l, out Vector128 v32l); + FTransformPass1Vector128(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128 v01h, out Vector128 v32h); // Second pass. - FTransformPass2SSE2(v01l, v32l, output); - FTransformPass2SSE2(v01h, v32h, output2); + FTransformPass2Vector128(v01l, v32l, output); + FTransformPass2Vector128(v01h, v32h, output2); } else { @@ -427,7 +420,7 @@ internal static unsafe class Vp8Encoding public static void FTransform(Span src, Span reference, Span output, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { ref byte srcRef = ref MemoryMarshal.GetReference(src); ref byte referenceRef = ref MemoryMarshal.GetReference(reference); @@ -449,29 +442,29 @@ internal static unsafe class Vp8Encoding // 20 21 22 23 * // 30 31 32 33 * // Shuffle. 
- Vector128 srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16()); - Vector128 srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16()); - Vector128 refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16()); - Vector128 refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16()); + Vector128 srcLow0 = Vector128_.UnpackLow(src0.AsInt16(), src1.AsInt16()); + Vector128 srcLow1 = Vector128_.UnpackLow(src2.AsInt16(), src3.AsInt16()); + Vector128 refLow0 = Vector128_.UnpackLow(ref0.AsInt16(), ref1.AsInt16()); + Vector128 refLow1 = Vector128_.UnpackLow(ref2.AsInt16(), ref3.AsInt16()); // 00 01 10 11 02 03 12 13 * * ... // 20 21 30 31 22 22 32 33 * * ... // Convert both to 16 bit. - Vector128 src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128.Zero); - Vector128 src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128.Zero); - Vector128 ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128.Zero); - Vector128 ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128.Zero); + Vector128 src0_16b = Vector128_.UnpackLow(srcLow0.AsByte(), Vector128.Zero); + Vector128 src1_16b = Vector128_.UnpackLow(srcLow1.AsByte(), Vector128.Zero); + Vector128 ref0_16b = Vector128_.UnpackLow(refLow0.AsByte(), Vector128.Zero); + Vector128 ref1_16b = Vector128_.UnpackLow(refLow1.AsByte(), Vector128.Zero); // Compute the difference. - Vector128 row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16()); - Vector128 row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16()); + Vector128 row01 = src0_16b.AsInt16() - ref0_16b.AsInt16(); + Vector128 row23 = src1_16b.AsInt16() - ref1_16b.AsInt16(); // First pass. - FTransformPass1SSE2(row01, row23, out Vector128 v01, out Vector128 v32); + FTransformPass1Vector128(row01, row23, out Vector128 v01, out Vector128 v32); // Second pass. - FTransformPass2SSE2(v01, v32, output); + FTransformPass2Vector128(v01, v32, output); } else { @@ -517,88 +510,88 @@ internal static unsafe class Vp8Encoding } } - public static void FTransformPass1SSE2(Vector128 row01, Vector128 row23, out Vector128 out01, out Vector128 out32) + public static void FTransformPass1Vector128(Vector128 row01, Vector128 row23, out Vector128 out01, out Vector128 out32) { // *in01 = 00 01 10 11 02 03 12 13 // *in23 = 20 21 30 31 22 23 32 33 - Vector128 shuf01_p = Sse2.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301); - Vector128 shuf32_p = Sse2.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301); + Vector128 shuf01_p = Vector128_.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301); + Vector128 shuf32_p = Vector128_.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301); // 00 01 10 11 03 02 13 12 // 20 21 30 31 23 22 33 32 - Vector128 s01 = Sse2.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64()); - Vector128 s32 = Sse2.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64()); + Vector128 s01 = Vector128_.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64()); + Vector128 s32 = Vector128_.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64()); // 00 01 10 11 20 21 30 31 // 03 02 13 12 23 22 33 32 - Vector128 a01 = Sse2.Add(s01.AsInt16(), s32.AsInt16()); - Vector128 a32 = Sse2.Subtract(s01.AsInt16(), s32.AsInt16()); + Vector128 a01 = s01.AsInt16() + s32.AsInt16(); + Vector128 a32 = s01.AsInt16() - s32.AsInt16(); // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] // [ (a0 + a1) << 3, ... 
] - Vector128 tmp0 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p + Vector128 tmp0 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p // [ (a0 - a1) << 3, ... ] - Vector128 tmp2 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16()); // K88m - Vector128 tmp11 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16()); // K5352_2217p - Vector128 tmp31 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16()); // K5352_2217m - Vector128 tmp12 = Sse2.Add(tmp11, Vector128.Create(1812)); - Vector128 tmp32 = Sse2.Add(tmp31, Vector128.Create(937)); - Vector128 tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9); - Vector128 tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9); - Vector128 s03 = Sse2.PackSignedSaturate(tmp0, tmp2); - Vector128 s12 = Sse2.PackSignedSaturate(tmp1, tmp3); - Vector128 slo = Sse2.UnpackLow(s03, s12); // 0 1 0 1 0 1... - Vector128 shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3 - Vector128 v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32()); - out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32()); - out32 = Sse2.Shuffle(v23, SimdUtils.Shuffle.MMShuffle1032); + Vector128 tmp2 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16()); // K88m + Vector128 tmp11 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16()); // K5352_2217p + Vector128 tmp31 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16()); // K5352_2217m + Vector128 tmp12 = tmp11 + Vector128.Create(1812); + Vector128 tmp32 = tmp31 + Vector128.Create(937); + Vector128 tmp1 = Vector128.ShiftRightArithmetic(tmp12, 9); + Vector128 tmp3 = Vector128.ShiftRightArithmetic(tmp32, 9); + Vector128 s03 = Vector128_.PackSignedSaturate(tmp0, tmp2); + Vector128 s12 = Vector128_.PackSignedSaturate(tmp1, tmp3); + Vector128 slo = Vector128_.UnpackLow(s03, s12); // 0 1 0 1 0 1... + Vector128 shi = Vector128_.UnpackHigh(s03, s12); // 2 3 2 3 2 3 + Vector128 v23 = Vector128_.UnpackHigh(slo.AsInt32(), shi.AsInt32()); + out01 = Vector128_.UnpackLow(slo.AsInt32(), shi.AsInt32()); + out32 = Vector128_.ShuffleNative(v23, SimdUtils.Shuffle.MMShuffle1032); } - public static void FTransformPass2SSE2(Vector128 v01, Vector128 v32, Span output) + public static void FTransformPass2Vector128(Vector128 v01, Vector128 v32, Span output) { // Same operations are done on the (0,3) and (1,2) pairs. 
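        // (For orientation, a scalar sketch of the full 4-point forward transform that passes 1 and 2
        // vectorize, using the same multiplier and rounding constants as above and below. This mirrors
        // the non-SIMD fallback path but is illustrative only; names and indexing are hypothetical.)
        private static void ForwardTransformScalar(ReadOnlySpan<int> d, Span<int> tmp, Span<short> output)
        {
            // Pass 1: rows of the 4x4 difference block 'd'.
            for (int i = 0; i < 4; i++)
            {
                int d0 = d[(i * 4) + 0], d1 = d[(i * 4) + 1], d2 = d[(i * 4) + 2], d3 = d[(i * 4) + 3];
                int a0 = d0 + d3, a1 = d1 + d2, a2 = d1 - d2, a3 = d0 - d3;
                tmp[(i * 4) + 0] = (a0 + a1) * 8;                                // (a0 + a1) << 3
                tmp[(i * 4) + 1] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9;
                tmp[(i * 4) + 2] = (a0 - a1) * 8;                                // (a0 - a1) << 3
                tmp[(i * 4) + 3] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9;
            }

            // Pass 2: columns of 'tmp', with the (a3 != 0) correction applied explicitly here.
            for (int i = 0; i < 4; i++)
            {
                int a0 = tmp[i] + tmp[12 + i], a1 = tmp[4 + i] + tmp[8 + i];
                int a2 = tmp[4 + i] - tmp[8 + i], a3 = tmp[i] - tmp[12 + i];
                output[i] = (short)((a0 + a1 + 7) >> 4);
                output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0));
                output[8 + i] = (short)((a0 - a1 + 7) >> 4);
                output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16);
            }
        }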
// a3 = v0 - v3 // a2 = v1 - v2 - Vector128 a32 = Sse2.Subtract(v01.AsInt16(), v32.AsInt16()); - Vector128 a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64()); + Vector128 a32 = v01.AsInt16() - v32.AsInt16(); + Vector128 a22 = Vector128_.UnpackHigh(a32.AsInt64(), a32.AsInt64()); - Vector128 b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16()); - Vector128 c1 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16()); // K5352_2217 - Vector128 c3 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16()); // K2217_5352 - Vector128 d1 = Sse2.Add(c1, Vector128.Create(12000 + (1 << 16))); // K12000PlusOne - Vector128 d3 = Sse2.Add(c3, Vector128.Create(51000)); - Vector128 e1 = Sse2.ShiftRightArithmetic(d1, 16); - Vector128 e3 = Sse2.ShiftRightArithmetic(d3, 16); + Vector128 b23 = Vector128_.UnpackLow(a22.AsInt16(), a32.AsInt16()); + Vector128 c1 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16()); // K5352_2217 + Vector128 c3 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16()); // K2217_5352 + Vector128 d1 = c1 + Vector128.Create(12000 + (1 << 16)); // K12000PlusOne + Vector128 d3 = c3 + Vector128.Create(51000); + Vector128 e1 = Vector128.ShiftRightArithmetic(d1, 16); + Vector128 e3 = Vector128.ShiftRightArithmetic(d3, 16); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) - Vector128 f1 = Sse2.PackSignedSaturate(e1, e1); - Vector128 f3 = Sse2.PackSignedSaturate(e3, e3); + Vector128 f1 = Vector128_.PackSignedSaturate(e1, e1); + Vector128 f3 = Vector128_.PackSignedSaturate(e3, e3); // g1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. 
// -> g1 = f1 + 1 - (a3 == 0) - Vector128 g1 = Sse2.Add(f1, Sse2.CompareEqual(a32, Vector128.Zero)); + Vector128 g1 = f1 + Vector128.Equals(a32, Vector128.Zero); // a0 = v0 + v3 // a1 = v1 + v2 - Vector128 a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16()); - Vector128 a01Plus7 = Sse2.Add(a01.AsInt16(), Vector128.Create((short)7)); - Vector128 a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16(); - Vector128 c0 = Sse2.Add(a01Plus7, a11); - Vector128 c2 = Sse2.Subtract(a01Plus7, a11); + Vector128 a01 = v01.AsInt16() + v32.AsInt16(); + Vector128 a01Plus7 = a01.AsInt16() + Vector128.Create((short)7); + Vector128 a11 = Vector128_.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16(); + Vector128 c0 = a01Plus7 + a11; + Vector128 c2 = a01Plus7 - a11; // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; - Vector128 d0 = Sse2.ShiftRightArithmetic(c0, 4); - Vector128 d2 = Sse2.ShiftRightArithmetic(c2, 4); + Vector128 d0 = Vector128.ShiftRightArithmetic(c0, 4); + Vector128 d2 = Vector128.ShiftRightArithmetic(c2, 4); - Vector128 d0g1 = Sse2.UnpackLow(d0.AsInt64(), g1.AsInt64()); - Vector128 d2f3 = Sse2.UnpackLow(d2.AsInt64(), f3.AsInt64()); + Vector128 d0g1 = Vector128_.UnpackLow(d0.AsInt64(), g1.AsInt64()); + Vector128 d2f3 = Vector128_.UnpackLow(d2.AsInt64(), f3.AsInt64()); ref short outputRef = ref MemoryMarshal.GetReference(output); Unsafe.As>(ref outputRef) = d0g1.AsInt16(); diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs index 40146c6af..d5f91b7c8 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs @@ -5,7 +5,7 @@ using System.Buffers; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -29,9 +29,9 @@ internal static class YuvConversion // ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16 public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) { - if (Sse41.IsSupported) + if (Vector128.IsHardwareAccelerated) { - UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer); + UpSampleVector128(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer); } else { @@ -107,7 +107,7 @@ internal static class YuvConversion // // Then m can be written as // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1 - private static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) + private static void UpSampleVector128(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer) { const int xStep = 3; Array.Clear(uvBuffer); @@ -138,18 +138,18 @@ internal static class YuvConversion { for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) { - UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru); - UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv); - ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); + UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), 
ru); + UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv); + ConvertYuvToBgrWithBottomYVector128(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep); } } else { for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16) { - UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru); - UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv); - ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep); + UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru); + UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv); + ConvertYuvToBgrVector128(topY, topDst, ru, rv, pos, xStep); } } @@ -161,18 +161,18 @@ internal static class YuvConversion Span tmpBottomDst = tmpTopDst[(4 * 32)..]; Span tmpTop = tmpBottomDst[(4 * 32)..]; Span tmpBottom = bottomY.IsEmpty ? null : tmpTop[32..]; - UpSampleLastBlock(topU[uvPos..], curU[uvPos..], leftOver, ru); - UpSampleLastBlock(topV[uvPos..], curV[uvPos..], leftOver, rv); + UpSampleLastBlockVector128(topU[uvPos..], curU[uvPos..], leftOver, ru); + UpSampleLastBlockVector128(topV[uvPos..], curV[uvPos..], leftOver, rv); topY[pos..len].CopyTo(tmpTop); if (!bottomY.IsEmpty) { bottomY[pos..len].CopyTo(tmpBottom); - ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); + ConvertYuvToBgrWithBottomYVector128(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep); } else { - ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep); + ConvertYuvToBgrVector128(tmpTop, tmpTopDst, ru, rv, 0, xStep); } tmpTopDst[..((len - pos) * xStep)].CopyTo(topDst[(pos * xStep)..]); @@ -184,7 +184,7 @@ internal static class YuvConversion } // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. - private static void UpSample32Pixels(ref byte r1, ref byte r2, Span output) + private static void UpSample32PixelsVector128(ref byte r1, ref byte r2, Span output) { // Load inputs. 
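        // (For reference, a scalar sketch of the rounding-exact four-way average used below.
        // AverageRound models the vector Average operation, i.e. (x + y + 1) >> 1; the OR-of-XORs
        // low bit removes the accumulated round-up bias so the result is exactly (a + b + c + d) / 4.)
        static byte AverageRound(byte x, byte y) => (byte)((x + y + 1) >> 1);

        static byte QuarterAverage(byte a, byte b, byte c, byte d)
        {
            byte s = AverageRound(a, d);                        // (a + d + 1) / 2
            byte t = AverageRound(b, c);                        // (b + c + 1) / 2
            int lsbCorrection = ((a ^ d) | (b ^ c) | (s ^ t)) & 1;
            return (byte)(AverageRound(s, t) - lsbCorrection);  // k = (a + b + c + d) / 4
        }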
Vector128 a = Unsafe.As>(ref r1); @@ -192,28 +192,28 @@ internal static class YuvConversion Vector128 c = Unsafe.As>(ref r2); Vector128 d = Unsafe.As>(ref Unsafe.Add(ref r2, 1)); - Vector128 s = Sse2.Average(a, d); // s = (a + d + 1) / 2 - Vector128 t = Sse2.Average(b, c); // t = (b + c + 1) / 2 - Vector128 st = Sse2.Xor(s, t); // st = s^t + Vector128 s = Vector128_.Average(a, d); // s = (a + d + 1) / 2 + Vector128 t = Vector128_.Average(b, c); // t = (b + c + 1) / 2 + Vector128 st = s ^ t; // st = s^t - Vector128 ad = Sse2.Xor(a, d); // ad = a^d - Vector128 bc = Sse2.Xor(b, c); // bc = b^c + Vector128 ad = a ^ d; // ad = a^d + Vector128 bc = b ^ c; // bc = b^c - Vector128 t1 = Sse2.Or(ad, bc); // (a^d) | (b^c) - Vector128 t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t) - Vector128 t3 = Sse2.And(t2, Vector128.Create((byte)1)); // (a^d) | (b^c) | (s^t) & 1 - Vector128 t4 = Sse2.Average(s, t); - Vector128 k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4 + Vector128 t1 = ad | bc; // (a^d) | (b^c) + Vector128 t2 = t1 | st; // (a^d) | (b^c) | (s^t) + Vector128 t3 = t2 & Vector128.Create((byte)1); // (a^d) | (b^c) | (s^t) & 1 + Vector128 t4 = Vector128_.Average(s, t); + Vector128 k = t4 - t3; // k = (a + b + c + d) / 4 - Vector128 diag1 = GetM(k, st, bc, t); - Vector128 diag2 = GetM(k, st, ad, s); + Vector128 diag1 = GetMVector128(k, st, bc, t); + Vector128 diag2 = GetMVector128(k, st, ad, s); // Pack the alternate pixels. - PackAndStore(a, b, diag1, diag2, output); // store top. - PackAndStore(c, d, diag2, diag1, output[(2 * 32)..]); + PackAndStoreVector128(a, b, diag1, diag2, output); // store top. + PackAndStoreVector128(c, d, diag2, diag1, output[(2 * 32)..]); } - private static void UpSampleLastBlock(Span tb, Span bb, int numPixels, Span output) + private static void UpSampleLastBlockVector128(Span tb, Span bb, int numPixels, Span output) { Span r1 = stackalloc byte[17]; Span r2 = stackalloc byte[17]; @@ -230,27 +230,27 @@ internal static class YuvConversion ref byte r1Ref = ref MemoryMarshal.GetReference(r1); ref byte r2Ref = ref MemoryMarshal.GetReference(r2); - UpSample32Pixels(ref r1Ref, ref r2Ref, output); + UpSample32PixelsVector128(ref r1Ref, ref r2Ref, output); } // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 - private static Vector128 GetM(Vector128 k, Vector128 st, Vector128 ij, Vector128 input) + private static Vector128 GetMVector128(Vector128 k, Vector128 st, Vector128 ij, Vector128 input) { - Vector128 tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2 - Vector128 tmp1 = Sse2.And(ij, st); // (ij) & (s^t) - Vector128 tmp2 = Sse2.Xor(k, input); // (k^in) - Vector128 tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in) - Vector128 tmp4 = Sse2.And(tmp3, Vector128.Create((byte)1)); // & 1 -> lsb_correction + Vector128 tmp0 = Vector128_.Average(k, input); // (k + in + 1) / 2 + Vector128 tmp1 = ij & st; // (ij) & (s^t) + Vector128 tmp2 = k ^ input; // (k^in) + Vector128 tmp3 = tmp1 | tmp2; // ((ij) & (s^t)) | (k^in) + Vector128 tmp4 = tmp3 & Vector128.Create((byte)1); // & 1 -> lsb_correction - return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction + return tmp0 - tmp4; // (k + in + 1) / 2 - lsb_correction } - private static void PackAndStore(Vector128 a, Vector128 b, Vector128 da, Vector128 db, Span output) + private static void PackAndStoreVector128(Vector128 a, Vector128 b, Vector128 da, Vector128 db, Span output) { - Vector128 ta = Sse2.Average(a, da); // (9a + 3b + 3c + d + 8) / 16 - Vector128 tb = Sse2.Average(b, db); // (3a + 9b + c + 3d 
+ 8) / 16 - Vector128 t1 = Sse2.UnpackLow(ta, tb); - Vector128 t2 = Sse2.UnpackHigh(ta, tb); + Vector128 ta = Vector128_.Average(a, da); // (9a + 3b + 3c + d + 8) / 16 + Vector128 tb = Vector128_.Average(b, db); // (3a + 9b + c + 3d + 8) / 16 + Vector128 t1 = Vector128_.UnpackLow(ta, tb); + Vector128 t2 = Vector128_.UnpackHigh(ta, tb); ref byte output0Ref = ref MemoryMarshal.GetReference(output); ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16); @@ -562,41 +562,42 @@ internal static class YuvConversion } [MethodImpl(InliningOptions.ShortMethod)] - private static void ConvertYuvToBgrSse41(Span topY, Span topDst, Span ru, Span rv, int curX, int step) => YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]); + private static void ConvertYuvToBgrVector128(Span topY, Span topDst, Span ru, Span rv, int curX, int step) + => YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]); [MethodImpl(InliningOptions.ShortMethod)] - private static void ConvertYuvToBgrWithBottomYSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) + private static void ConvertYuvToBgrWithBottomYVector128(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step) { - YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]); - YuvToBgrSse41(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]); + YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]); + YuvToBgrVector128(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]); } - private static void YuvToBgrSse41(Span y, Span u, Span v, Span dst) + private static void YuvToBgrVector128(Span y, Span u, Span v, Span dst) { ref byte yRef = ref MemoryMarshal.GetReference(y); ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128 r0, out Vector128 g0, out Vector128 b0); - ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128 r1, out Vector128 g1, out Vector128 b1); - ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128 r2, out Vector128 g2, out Vector128 b2); - ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128 r3, out Vector128 g3, out Vector128 b3); + ConvertYuv444ToBgrVector128(ref yRef, ref uRef, ref vRef, out Vector128 r0, out Vector128 g0, out Vector128 b0); + ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128 r1, out Vector128 g1, out Vector128 b1); + ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128 r2, out Vector128 g2, out Vector128 b2); + ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128 r3, out Vector128 g3, out Vector128 b3); // Cast to 8b and store as BBBBGGGGRRRR. 
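        // (PackUnsignedSaturate narrows two vectors of 16-bit values into one vector of bytes,
        // clamping each element to [0, 255]; per element it is simply the following. Illustrative.)
        static byte SaturateToByte(short x) => (byte)Math.Clamp((int)x, 0, 255);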
- Vector128 bgr0 = Sse2.PackUnsignedSaturate(b0, b1); - Vector128 bgr1 = Sse2.PackUnsignedSaturate(b2, b3); - Vector128 bgr2 = Sse2.PackUnsignedSaturate(g0, g1); - Vector128 bgr3 = Sse2.PackUnsignedSaturate(g2, g3); - Vector128 bgr4 = Sse2.PackUnsignedSaturate(r0, r1); - Vector128 bgr5 = Sse2.PackUnsignedSaturate(r2, r3); + Vector128 bgr0 = Vector128_.PackUnsignedSaturate(b0, b1); + Vector128 bgr1 = Vector128_.PackUnsignedSaturate(b2, b3); + Vector128 bgr2 = Vector128_.PackUnsignedSaturate(g0, g1); + Vector128 bgr3 = Vector128_.PackUnsignedSaturate(g2, g3); + Vector128 bgr4 = Vector128_.PackUnsignedSaturate(r0, r1); + Vector128 bgr5 = Vector128_.PackUnsignedSaturate(r2, r3); // Pack as BGRBGRBGRBGR. - PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst); + PlanarTo24bVector128(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst); } // Pack the planar buffers // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... - private static void PlanarTo24bSse41(Vector128 input0, Vector128 input1, Vector128 input2, Vector128 input3, Vector128 input4, Vector128 input5, Span rgb) + private static void PlanarTo24bVector128(Vector128 input0, Vector128 input1, Vector128 input2, Vector128 input3, Vector128 input4, Vector128 input5, Span rgb) { // The input is 6 registers of sixteen 8b but for the sake of explanation, // let's take 6 registers of four 8b values. @@ -612,7 +613,7 @@ internal static class YuvConversion // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 // Process R. - ChannelMixing( + ChannelMixingVector128( input0, input1, Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5), // PlanarTo24Shuffle0 @@ -627,7 +628,7 @@ internal static class YuvConversion // Process G. // Same as before, just shifted to the left by one and including the right padding. - ChannelMixing( + ChannelMixingVector128( input2, input3, Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255), // PlanarTo24Shuffle3 @@ -641,7 +642,7 @@ internal static class YuvConversion out Vector128 g5); // Process B. - ChannelMixing( + ChannelMixingVector128( input4, input5, Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255), // PlanarTo24Shuffle6 @@ -655,24 +656,24 @@ internal static class YuvConversion out Vector128 b5); // OR the different channels. 
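        // (For reference: the shuffles above spread each channel into every third byte slot — shuffle
        // indices of 255 select zero — and the ORs below merge the three spread channels. The scalar
        // effect is plain triplet interleaving; the caller passes the channels in B, G, R order, so the
        // triplets come out as BGR. Illustrative sketch:)
        private static void PlanarTo24bScalar(ReadOnlySpan<byte> c0, ReadOnlySpan<byte> c1, ReadOnlySpan<byte> c2, Span<byte> packed)
        {
            for (int i = 0; i < c0.Length; i++)
            {
                packed[(i * 3) + 0] = c0[i];
                packed[(i * 3) + 1] = c1[i];
                packed[(i * 3) + 2] = c2[i];
            }
        }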
- Vector128 rg0 = Sse2.Or(r0, g0); - Vector128 rg1 = Sse2.Or(r1, g1); - Vector128 rg2 = Sse2.Or(r2, g2); - Vector128 rg3 = Sse2.Or(r3, g3); - Vector128 rg4 = Sse2.Or(r4, g4); - Vector128 rg5 = Sse2.Or(r5, g5); + Vector128 rg0 = r0 | g0; + Vector128 rg1 = r1 | g1; + Vector128 rg2 = r2 | g2; + Vector128 rg3 = r3 | g3; + Vector128 rg4 = r4 | g4; + Vector128 rg5 = r5 | g5; ref byte outputRef = ref MemoryMarshal.GetReference(rgb); - Unsafe.As>(ref outputRef) = Sse2.Or(rg0, b0); - Unsafe.As>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1); - Unsafe.As>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2); - Unsafe.As>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3); - Unsafe.As>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4); - Unsafe.As>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5); + Unsafe.As>(ref outputRef) = rg0 | b0; + Unsafe.As>(ref Unsafe.Add(ref outputRef, 16)) = rg1 | b1; + Unsafe.As>(ref Unsafe.Add(ref outputRef, 32)) = rg2 | b2; + Unsafe.As>(ref Unsafe.Add(ref outputRef, 48)) = rg3 | b3; + Unsafe.As>(ref Unsafe.Add(ref outputRef, 64)) = rg4 | b4; + Unsafe.As>(ref Unsafe.Add(ref outputRef, 80)) = rg5 | b5; } // Shuffles the input buffer as A0 0 0 A1 0 0 A2 - private static void ChannelMixing( + private static void ChannelMixingVector128( Vector128 input0, Vector128 input1, Vector128 shuffle0, @@ -685,53 +686,53 @@ internal static class YuvConversion out Vector128 output4, out Vector128 output5) { - output0 = Ssse3.Shuffle(input0, shuffle0); - output1 = Ssse3.Shuffle(input0, shuffle1); - output2 = Ssse3.Shuffle(input0, shuffle2); - output3 = Ssse3.Shuffle(input1, shuffle0); - output4 = Ssse3.Shuffle(input1, shuffle1); - output5 = Ssse3.Shuffle(input1, shuffle2); + output0 = Vector128_.ShuffleNative(input0, shuffle0); + output1 = Vector128_.ShuffleNative(input0, shuffle1); + output2 = Vector128_.ShuffleNative(input0, shuffle2); + output3 = Vector128_.ShuffleNative(input1, shuffle0); + output4 = Vector128_.ShuffleNative(input1, shuffle1); + output5 = Vector128_.ShuffleNative(input1, shuffle2); } // Convert 32 samples of YUV444 to B/G/R - private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128 r, out Vector128 g, out Vector128 b) + private static void ConvertYuv444ToBgrVector128(ref byte y, ref byte u, ref byte v, out Vector128 r, out Vector128 g, out Vector128 b) { // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. Vector128 y0 = Unsafe.As>(ref y); Vector128 u0 = Unsafe.As>(ref u); Vector128 v0 = Unsafe.As>(ref v); - y0 = Sse2.UnpackLow(Vector128.Zero, y0); - u0 = Sse2.UnpackLow(Vector128.Zero, u0); - v0 = Sse2.UnpackLow(Vector128.Zero, v0); + y0 = Vector128_.UnpackLow(Vector128.Zero, y0); + u0 = Vector128_.UnpackLow(Vector128.Zero, u0); + v0 = Vector128_.UnpackLow(Vector128.Zero, v0); // These constants are 14b fixed-point version of ITU-R BT.601 constants. 
         // R = (19077 * y + 26149 * v - 14234) >> 6
         // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
         // B = (19077 * y + 33050 * u - 17685) >> 6
-        var k19077 = Vector128.Create((ushort)19077);
-        var k26149 = Vector128.Create((ushort)26149);
-        var k14234 = Vector128.Create((ushort)14234);
+        Vector128<ushort> k19077 = Vector128.Create((ushort)19077);
+        Vector128<ushort> k26149 = Vector128.Create((ushort)26149);
+        Vector128<ushort> k14234 = Vector128.Create((ushort)14234);

-        Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), k19077);
-        Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), k26149);
-        Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
-        Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
+        Vector128<ushort> y1 = Vector128_.MultiplyHigh(y0.AsUInt16(), k19077);
+        Vector128<ushort> r0 = Vector128_.MultiplyHigh(v0.AsUInt16(), k26149);
+        Vector128<ushort> g0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
+        Vector128<ushort> g1 = Vector128_.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));

-        Vector128<ushort> r1 = Sse2.Subtract(y1.AsUInt16(), k14234);
-        Vector128<ushort> r2 = Sse2.Add(r1, r0);
+        Vector128<ushort> r1 = y1.AsUInt16() - k14234;
+        Vector128<ushort> r2 = r1 + r0;

-        Vector128<ushort> g2 = Sse2.Add(y1.AsUInt16(), Vector128.Create((ushort)8708));
-        Vector128<ushort> g3 = Sse2.Add(g0, g1);
-        Vector128<ushort> g4 = Sse2.Subtract(g2, g3);
+        Vector128<ushort> g2 = y1.AsUInt16() + Vector128.Create((ushort)8708);
+        Vector128<ushort> g3 = g0 + g1;
+        Vector128<ushort> g4 = g2 - g3;

-        Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
-        Vector128<ushort> b1 = Sse2.AddSaturate(b0, y1);
-        Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, Vector128.Create((ushort)17685));
+        Vector128<ushort> b0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
+        Vector128<ushort> b1 = Vector128_.AddSaturate(b0, y1);
+        Vector128<ushort> b2 = Vector128_.SubtractSaturate(b1, Vector128.Create((ushort)17685));

         // Use logical shift for B2, which can be larger than 32767.
-        r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6);      // range: [-14234, 30815]
-        g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6);      // range: [-10953, 27710]
-        b = Sse2.ShiftRightLogical(b2.AsInt16(), 6);         // range: [0, 34238]
+        r = Vector128.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
+        g = Vector128.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
+        b = Vector128.ShiftRightLogical(b2.AsInt16(), 6);    // range: [0, 34238]
     }

     [MethodImpl(InliningOptions.ShortMethod)]
diff --git a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
index 1ca409f9a..acfa26b4f 100644
--- a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
+++ b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
@@ -3,7 +3,7 @@
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.PixelFormats;

 namespace SixLabors.ImageSharp.Formats.Webp;
@@ -20,7 +20,7 @@ internal static class WebpCommonUtils
     /// Returns true if alpha has non-0xff values.
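Note (editorial, not part of the patch): the rewritten CheckNonOpaque body that follows keeps the original trick: mask the alpha byte of each BGRA pixel, then pack 32 -> 16 -> 8 bits with signed saturation so that a fully opaque alpha (0xFF in the top byte, i.e. a negative Int32) collapses to 0x80, and finally compare against 0x80. Switching from MoveMask to ExtractMostSignificantBits also changes the "all lanes matched" sentinel from the signed -1 to the unsigned 0xFFFF_FFFF (or 0xFFFF for 128-bit vectors). A scalar sketch of the predicate being vectorised, with a hypothetical helper name:

    // Illustrative only: true when any pixel's alpha differs from 0xFF.
    static bool HasNonOpaqueAlpha(ReadOnlySpan<byte> bgraBytes)
    {
        for (int i = 3; i < bgraBytes.Length; i += 4) // alpha is the 4th byte of each BGRA pixel
        {
            if (bgraBytes[i] != 0xFF)
            {
                return true;
            }
        }

        return false;
    }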
     public static unsafe bool CheckNonOpaque(ReadOnlySpan<Bgra32> row)
     {
-        if (Avx2.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
         {
             ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
             int i = 0;
@@ -32,20 +32,20 @@ internal static class WebpCommonUtils
                 for (; i + 128 <= length; i += 128)
                 {
-                    Vector256<byte> a0 = Avx.LoadVector256(src + i).AsByte();
-                    Vector256<byte> a1 = Avx.LoadVector256(src + i + 32).AsByte();
-                    Vector256<byte> a2 = Avx.LoadVector256(src + i + 64).AsByte();
-                    Vector256<byte> a3 = Avx.LoadVector256(src + i + 96).AsByte();
-                    Vector256<int> b0 = Avx2.And(a0, alphaMaskVector256).AsInt32();
-                    Vector256<int> b1 = Avx2.And(a1, alphaMaskVector256).AsInt32();
-                    Vector256<int> b2 = Avx2.And(a2, alphaMaskVector256).AsInt32();
-                    Vector256<int> b3 = Avx2.And(a3, alphaMaskVector256).AsInt32();
-                    Vector256<short> c0 = Avx2.PackSignedSaturate(b0, b1).AsInt16();
-                    Vector256<short> c1 = Avx2.PackSignedSaturate(b2, b3).AsInt16();
-                    Vector256<byte> d = Avx2.PackSignedSaturate(c0, c1).AsByte();
-                    Vector256<byte> bits = Avx2.CompareEqual(d, all0x80Vector256);
-                    int mask = Avx2.MoveMask(bits);
-                    if (mask != -1)
+                    Vector256<byte> a0 = Vector256.Load(src + i).AsByte();
+                    Vector256<byte> a1 = Vector256.Load(src + i + 32).AsByte();
+                    Vector256<byte> a2 = Vector256.Load(src + i + 64).AsByte();
+                    Vector256<byte> a3 = Vector256.Load(src + i + 96).AsByte();
+                    Vector256<int> b0 = (a0 & alphaMaskVector256).AsInt32();
+                    Vector256<int> b1 = (a1 & alphaMaskVector256).AsInt32();
+                    Vector256<int> b2 = (a2 & alphaMaskVector256).AsInt32();
+                    Vector256<int> b3 = (a3 & alphaMaskVector256).AsInt32();
+                    Vector256<short> c0 = Vector256_.PackSignedSaturate(b0, b1).AsInt16();
+                    Vector256<short> c1 = Vector256_.PackSignedSaturate(b2, b3).AsInt16();
+                    Vector256<byte> d = Vector256_.PackSignedSaturate(c0, c1).AsByte();
+                    Vector256<byte> bits = Vector256.Equals(d, all0x80Vector256);
+                    uint mask = bits.ExtractMostSignificantBits();
+                    if (mask != 0xFFFF_FFFF)
                     {
                         return true;
                     }
@@ -53,7 +53,7 @@ internal static class WebpCommonUtils
                 for (; i + 64 <= length; i += 64)
                 {
-                    if (IsNoneOpaque64Bytes(src, i))
+                    if (IsNoneOpaque64BytesVector128(src, i))
                     {
                         return true;
                     }
@@ -61,7 +61,7 @@ internal static class WebpCommonUtils
                 for (; i + 32 <= length; i += 32)
                 {
-                    if (IsNoneOpaque32Bytes(src, i))
+                    if (IsNonOpaque32BytesVector128(src, i))
                     {
                         return true;
                     }
@@ -76,7 +76,7 @@ internal static class WebpCommonUtils
                 }
             }
         }
-        else if (Sse2.IsSupported)
+        else if (Vector128.IsHardwareAccelerated)
         {
             ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
             int i = 0;
@@ -85,7 +85,7 @@ internal static class WebpCommonUtils
             {
                 for (; i + 64 <= length; i += 64)
                 {
-                    if (IsNoneOpaque64Bytes(src, i))
+                    if (IsNoneOpaque64BytesVector128(src, i))
                     {
                         return true;
                     }
@@ -93,7 +93,7 @@ internal static class WebpCommonUtils
                 for (; i + 32 <= length; i += 32)
                 {
-                    if (IsNoneOpaque32Bytes(src, i))
+                    if (IsNonOpaque32BytesVector128(src, i))
                     {
                         return true;
                     }
@@ -122,38 +122,38 @@ internal static class WebpCommonUtils
         return false;
     }

-    private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i)
+    private static unsafe bool IsNoneOpaque64BytesVector128(byte* src, int i)
     {
         Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
-        Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
-        Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
-        Vector128<byte> a2 = Sse2.LoadVector128(src + i + 32).AsByte();
-        Vector128<byte> a3 = Sse2.LoadVector128(src + i + 48).AsByte();
-        Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
-        Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
-        Vector128<int> b2 = Sse2.And(a2, alphaMask).AsInt32();
-        Vector128<int> b3 = Sse2.And(a3, alphaMask).AsInt32();
-        Vector128<short> c0 = Sse2.PackSignedSaturate(b0, b1).AsInt16();
-        Vector128<short> c1 = Sse2.PackSignedSaturate(b2, b3).AsInt16();
-        Vector128<byte> d = Sse2.PackSignedSaturate(c0, c1).AsByte();
-        Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
-        int mask = Sse2.MoveMask(bits);
+        Vector128<byte> a0 = Vector128.Load(src + i).AsByte();
+        Vector128<byte> a1 = Vector128.Load(src + i + 16).AsByte();
+        Vector128<byte> a2 = Vector128.Load(src + i + 32).AsByte();
+        Vector128<byte> a3 = Vector128.Load(src + i + 48).AsByte();
+        Vector128<int> b0 = (a0 & alphaMask).AsInt32();
+        Vector128<int> b1 = (a1 & alphaMask).AsInt32();
+        Vector128<int> b2 = (a2 & alphaMask).AsInt32();
+        Vector128<int> b3 = (a3 & alphaMask).AsInt32();
+        Vector128<short> c0 = Vector128_.PackSignedSaturate(b0, b1).AsInt16();
+        Vector128<short> c1 = Vector128_.PackSignedSaturate(b2, b3).AsInt16();
+        Vector128<byte> d = Vector128_.PackSignedSaturate(c0, c1).AsByte();
+        Vector128<byte> bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte());
+        uint mask = bits.ExtractMostSignificantBits();
         return mask != 0xFFFF;
     }

-    private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i)
+    private static unsafe bool IsNonOpaque32BytesVector128(byte* src, int i)
     {
         Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
-        Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
-        Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
-        Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
-        Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
-        Vector128<short> c = Sse2.PackSignedSaturate(b0, b1).AsInt16();
-        Vector128<byte> d = Sse2.PackSignedSaturate(c, c).AsByte();
-        Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
-        int mask = Sse2.MoveMask(bits);
+        Vector128<byte> a0 = Vector128.Load(src + i).AsByte();
+        Vector128<byte> a1 = Vector128.Load(src + i + 16).AsByte();
+        Vector128<int> b0 = (a0 & alphaMask).AsInt32();
+        Vector128<int> b1 = (a1 & alphaMask).AsInt32();
+        Vector128<short> c = Vector128_.PackSignedSaturate(b0, b1).AsInt16();
+        Vector128<byte> d = Vector128_.PackSignedSaturate(c, c).AsByte();
+        Vector128<byte> bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte());
+        uint mask = bits.ExtractMostSignificantBits();
         return mask != 0xFFFF;
     }
 }
diff --git a/src/ImageSharp/Metadata/Profiles/IPTC/IptcProfile.cs b/src/ImageSharp/Metadata/Profiles/IPTC/IptcProfile.cs
index 162fae96b..85c23d174 100644
--- a/src/ImageSharp/Metadata/Profiles/IPTC/IptcProfile.cs
+++ b/src/ImageSharp/Metadata/Profiles/IPTC/IptcProfile.cs
@@ -3,7 +3,6 @@
 using System.Buffers.Binary;
 using System.Collections.ObjectModel;
-using System.Diagnostics.CodeAnalysis;
 using System.Globalization;
 using System.Text;
 using SixLabors.ImageSharp.Metadata.Profiles.IPTC;
diff --git a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs
index c769b389d..ea970a718 100644
--- a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs
+++ b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs
@@ -210,6 +210,8 @@ public partial class PixelOperations<TPixel>
     {
         GuardUnpackIntoRgbPlanes(redChannel, greenChannel, blueChannel, source);

+        // TODO: This can be much faster.
+        // Convert to Rgba32 first using pixel operations then use the R, G, B properties.
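Note (editorial, not part of the patch): the TODO above suggests routing through the existing bulk Rgba32 conversion instead of the per-component loop that follows. A hedged sketch of that idea, assuming the `PixelOperations<TPixel>.Instance.ToRgba32(Configuration, ReadOnlySpan<TPixel>, Span<Rgba32>)` bulk conversion is available and ignoring whatever scaling the real float planes require; names and the allocation strategy are illustrative only:

    // Hypothetical sketch of the TODO above; not the PR's implementation.
    static void UnpackViaRgba32<TPixel>(
        Configuration configuration,
        ReadOnlySpan<TPixel> source,
        Span<float> redChannel,
        Span<float> greenChannel,
        Span<float> blueChannel)
        where TPixel : unmanaged, IPixel<TPixel>
    {
        Rgba32[] rgba = new Rgba32[source.Length]; // a pooled buffer would be preferable in practice
        PixelOperations<TPixel>.Instance.ToRgba32(configuration, source, rgba);

        for (int i = 0; i < source.Length; i++)
        {
            redChannel[i] = rgba[i].R;
            greenChannel[i] = rgba[i].G;
            blueChannel[i] = rgba[i].B;
        }
    }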
         int count = source.Length;

         ref float r = ref MemoryMarshal.GetReference(redChannel);
diff --git a/tests/ImageSharp.Tests/Formats/Tiff/TiffDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Tiff/TiffDecoderTests.cs
index 819547c51..cf27e561f 100644
--- a/tests/ImageSharp.Tests/Formats/Tiff/TiffDecoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Tiff/TiffDecoderTests.cs
@@ -341,16 +341,46 @@ public class TiffDecoderTests : TiffDecoderBaseTester
     [Theory]
     [WithFile(Cmyk, PixelTypes.Rgba32)]
     [WithFile(CmykLzwPredictor, PixelTypes.Rgba32)]
+    [WithFile(CmykJpeg, PixelTypes.Rgba32)]
     public void TiffDecoder_CanDecode_Cmyk<TPixel>(TestImageProvider<TPixel> provider)
         where TPixel : unmanaged, IPixel<TPixel>
     {
         // Note: The image from MagickReferenceDecoder does not look right, maybe we are doing something wrong
         // converting the pixel data from Magick.NET to our format with CMYK?
-        using Image<TPixel> image = provider.GetImage();
+        using Image<TPixel> image = provider.GetImage(TiffDecoder.Instance);
         image.DebugSave(provider);
         image.CompareToReferenceOutput(ImageComparer.Exact, provider);
     }

+    [Theory]
+    [WithFile(Issues2454_A, PixelTypes.Rgba32)]
+    [WithFile(Issues2454_B, PixelTypes.Rgba32)]
+    public void TiffDecoder_CanDecode_YccK<TPixel>(TestImageProvider<TPixel> provider)
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        using Image<TPixel> image = provider.GetImage(TiffDecoder.Instance);
+        image.DebugSave(provider);
+        image.CompareToReferenceOutput(ImageComparer.Exact, provider);
+    }
+
+    [Theory]
+    [WithFile(Issues2454_A, PixelTypes.Rgba32)]
+    [WithFile(Issues2454_B, PixelTypes.Rgba32)]
+    public void TiffDecoder_CanDecode_YccK_ICC<TPixel>(TestImageProvider<TPixel> provider)
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        DecoderOptions options = new()
+        {
+            ColorProfileHandling = ColorProfileHandling.Convert,
+        };
+
+        using Image<TPixel> image = provider.GetImage(TiffDecoder.Instance, options);
+        image.DebugSave(provider);
+
+        // Linux reports a 0.0000% difference, so we use a tolerant comparer here.
+        image.CompareToReferenceOutput(ImageComparer.TolerantPercentage(0.0001F), provider);
+    }
+
     [Theory]
     [WithFile(FlowerRgb101010Contiguous, PixelTypes.Rgba32)]
     [WithFile(FlowerRgb101010Planar, PixelTypes.Rgba32)]
diff --git a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
index c5e8c975f..6073888fe 100644
--- a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
@@ -71,17 +71,17 @@ public class ColorSpaceTransformUtilsTests
     public void CollectColorBlueTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.AllowAll);

     [Fact]
-    public void CollectColorBlueTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
+    public void CollectColorBlueTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);

     [Fact]
-    public void CollectColorBlueTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
+    public void CollectColorBlueTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);

     [Fact]
     public void CollectColorRedTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.AllowAll);

     [Fact]
-    public void CollectColorRedTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
+    public void CollectColorRedTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);

     [Fact]
-    public void CollectColorRedTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
+    public void CollectColorRedTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
 }
diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs
index 8b5529ac1..0908e6e6b 100644
--- a/tests/ImageSharp.Tests/TestImages.cs
+++ b/tests/ImageSharp.Tests/TestImages.cs
@@ -1122,6 +1122,7 @@ public static class TestImages
     public const string Cmyk = "Tiff/Cmyk.tiff";
     public const string Cmyk64BitDeflate = "Tiff/cmyk_deflate_64bit.tiff";
     public const string CmykLzwPredictor = "Tiff/Cmyk-lzw-predictor.tiff";
+    public const string CmykJpeg = "Tiff/Cmyk-jpeg.tiff";

     public const string Issues1716Rgb161616BitLittleEndian = "Tiff/Issues/Issue1716.tiff";
     public const string Issues1891 = "Tiff/Issues/Issue1891.tiff";
@@ -1129,6 +1130,8 @@
     public const string Issues2149 = "Tiff/Issues/Group4CompressionWithStrips.tiff";
     public const string Issues2255 = "Tiff/Issues/Issue2255.png";
     public const string Issues2435 = "Tiff/Issues/Issue2435.tiff";
+    public const string Issues2454_A = "Tiff/Issues/Issue2454_A.tif";
+    public const string Issues2454_B = "Tiff/Issues/Issue2454_B.tif";
     public const string Issues2587 = "Tiff/Issues/Issue2587.tiff";
     public const string Issues2679 = "Tiff/Issues/Issue2679.tiff";
     public const string JpegCompressedGray0000539558 = "Tiff/Issues/JpegCompressedGray-0000539558.tiff";
diff --git a/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_Cmyk_Rgba32_Cmyk-jpeg.png b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_Cmyk_Rgba32_Cmyk-jpeg.png
new file mode 100644
index 000000000..06d60e030
--- /dev/null
+++ b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_Cmyk_Rgba32_Cmyk-jpeg.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f68db78d765a7f36570cd7b57a1f06cfca24c3b4916d0692a4aa051209ec327
+size 616
diff --git a/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_A.png b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_A.png
new file mode 100644
index 000000000..97118c15b
--- /dev/null
+++ b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_A.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4f77673028643af0ac02a8f6a1e2db14052177e3401c369391a8ff7e943770c
+size 7679254
diff --git a/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_B.png b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_B.png
new file mode 100644
index 000000000..52accc22d
--- /dev/null
+++ b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_B.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e616895c21fd8b19a216e8a3ef4968bd413589b5875efdac29860f019a710527
+size 7517284
diff --git a/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_A.png b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_A.png
new file mode 100644
index 000000000..350d1af68
--- /dev/null
+++ b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_A.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7911e059049c427229136479740fd62e2e09907549ec3e1421a6a60da6167cc
+size 7840892
diff --git a/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_B.png b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_B.png
new file mode 100644
index 000000000..3dc99e604
--- /dev/null
+++ b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_B.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:291f2033a7b4cfc10fb3301283c167b3fbc288bc173c95b21bc726bf076865af
+size 7649213
diff --git a/tests/Images/Input/Tiff/Cmyk-jpeg.tiff b/tests/Images/Input/Tiff/Cmyk-jpeg.tiff
new file mode 100644
index 000000000..e486403e4
--- /dev/null
+++ b/tests/Images/Input/Tiff/Cmyk-jpeg.tiff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abb923e457acc31a7f18c46a7d58fc5a42f5c3d197236403921e3ee623fa4fac
+size 2046
diff --git a/tests/Images/Input/Tiff/Cmyk-planar-jpg.tiff b/tests/Images/Input/Tiff/Cmyk-planar-jpg.tiff
new file mode 100644
index 000000000..e486403e4
--- /dev/null
+++ b/tests/Images/Input/Tiff/Cmyk-planar-jpg.tiff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abb923e457acc31a7f18c46a7d58fc5a42f5c3d197236403921e3ee623fa4fac
+size 2046
diff --git a/tests/Images/Input/Tiff/Issues/Issue2454_A.tif b/tests/Images/Input/Tiff/Issues/Issue2454_A.tif
new file mode 100644
index 000000000..99e13be55
--- /dev/null
+++ b/tests/Images/Input/Tiff/Issues/Issue2454_A.tif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:868fbf7fc7a61bc6b1226160c8dc3bb1faebd8d4a2a6fe9494962f3fbe3a7fdc
+size 5024256
diff --git a/tests/Images/Input/Tiff/Issues/Issue2454_B.tif b/tests/Images/Input/Tiff/Issues/Issue2454_B.tif
new file mode 100644
index 000000000..9c322b765
--- /dev/null
+++ b/tests/Images/Input/Tiff/Issues/Issue2454_B.tif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:867851192f540742ba1481f503834f8aa77caa03ac59f8204d098bf940b0bb3a
+size 4387646