Browse Source

Merge pull request #2933 from SixLabors/js/webp-arm

Add ARM support to WEBP Utilities
pull/2941/head
James Jackson-South 1 year ago
committed by GitHub
parent
commit
166a846b6d
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 17
      src/ImageSharp/Common/Helpers/Numerics.cs
  2. 57
      src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
  3. 1092
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  4. 315
      src/ImageSharp/Common/Helpers/Vector256Utilities.cs
  5. 96
      src/ImageSharp/Common/Helpers/Vector512Utilities.cs
  6. 7
      src/ImageSharp/Formats/Webp/AlphaDecoder.cs
  7. 145
      src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
  8. 196
      src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
  9. 1273
      src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
  10. 355
      src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
  11. 209
      src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
  12. 88
      src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
  13. 8
      tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs

17
src/ImageSharp/Common/Helpers/Numerics.cs

@ -884,23 +884,6 @@ internal static class Numerics
accumulator += intHigh;
}
/// <summary>
/// Reduces elements of the vector into one sum.
/// </summary>
/// <param name="accumulator">The accumulator to reduce.</param>
/// <returns>The sum of all elements.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int ReduceSum(Vector128<int> accumulator)
{
// Add odd to even.
Vector128<int> vsum = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0b_11_11_01_01));
// Add high to low.
vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10));
return Sse2.ConvertToInt32(vsum);
}
/// <summary>
/// Reduces elements of the vector into one sum.
/// </summary>

57
src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

@ -66,9 +66,9 @@ internal static partial class SimdUtils
ref Span<float> destination,
[ConstantExpected] byte control)
{
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) ||
Vector128.IsHardwareAccelerated)
if (Vector512.IsHardwareAccelerated ||
Vector256.IsHardwareAccelerated ||
Vector128.IsHardwareAccelerated)
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@ -112,9 +112,9 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) ||
(Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte))
if (Vector512.IsHardwareAccelerated ||
Vector256.IsHardwareAccelerated ||
Vector128.IsHardwareAccelerated)
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@ -158,7 +158,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight)
if (Vector128.IsHardwareAccelerated)
{
int remainder = source.Length % (Vector128<byte>.Count * 3);
@ -190,7 +190,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated)
{
int remainder = source.Length % (Vector128<byte>.Count * 3);
@ -223,7 +223,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated)
{
int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1); // bit-hack for modulo
@ -249,7 +249,7 @@ internal static partial class SimdUtils
Span<float> destination,
[ConstantExpected] byte control)
{
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
if (Vector512.IsHardwareAccelerated)
{
ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
@ -277,7 +277,7 @@ internal static partial class SimdUtils
}
}
}
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat)
else if (Vector256.IsHardwareAccelerated)
{
ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@ -341,7 +341,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte)
if (Vector512.IsHardwareAccelerated)
{
Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@ -373,8 +373,13 @@ internal static partial class SimdUtils
}
}
}
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte)
else if (Vector256.IsHardwareAccelerated)
{
// ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb).
// MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte,
// so the shuffle will not zero elements. Because vpshufb uses only the low 4 bits (b[i] & 0x0F)
// for indexing within each lane, and ignores the upper bits unless bit 7 is set,
// this usage is guaranteed to remain within-lane and non-zeroing.
Span<byte> temp = stackalloc byte[Vector256<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
Vector256<byte> mask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp));
@ -391,21 +396,21 @@ internal static partial class SimdUtils
ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector256_.ShuffleNative(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector256_.ShufflePerLane(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)
else if (Vector128.IsHardwareAccelerated)
{
Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@ -445,9 +450,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated &&
Vector128_.SupportsShuffleNativeByte &&
Vector128_.SupportsAlignRight)
if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
@ -507,10 +510,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated &&
Vector128_.SupportsShuffleNativeByte &&
Vector128_.SupportsShiftByte &&
Vector128_.SupportsAlignRight)
if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@ -553,10 +553,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated &&
Vector128_.SupportsShuffleNativeByte &&
Vector128_.SupportsShiftByte &&
Vector128_.SupportsAlignRight)
if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);

1092
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

File diff suppressed because it is too large

315
src/ImageSharp/Common/Helpers/Vector256Utilities.cs

@ -1,7 +1,6 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers;
internal static class Vector256_
#pragma warning restore SA1649 // File name should match first type name
{
/// <summary>
/// Gets a value indicating whether shuffle float operations are supported.
/// </summary>
public static bool SupportsShuffleNativeFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx.IsSupported;
}
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
/// </summary>
public static bool SupportsShuffleNativeByte
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx2.IsSupported;
}
/// <summary>
/// Creates a new vector by selecting values from an input vector using a set of indices.
/// </summary>
@ -47,15 +28,7 @@ internal static class Vector256_
/// <returns>The <see cref="Vector256{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> ShuffleNative(Vector256<float> vector, [ConstantExpected] byte control)
{
if (Avx.IsSupported)
{
return Avx.Shuffle(vector, vector, control);
}
ThrowUnreachableException();
return default;
}
=> Avx.Shuffle(vector, vector, control);
/// <summary>
/// Creates a new vector by selecting values from an input vector using a set of indices.</summary>
@ -66,15 +39,17 @@ internal static class Vector256_
/// </param>
/// <returns>The <see cref="Vector256{Byte}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> ShuffleNative(Vector256<byte> vector, Vector256<byte> indices)
public static Vector256<byte> ShufflePerLane(Vector256<byte> vector, Vector256<byte> indices)
{
if (Avx2.IsSupported)
{
return Avx2.Shuffle(vector, indices);
}
ThrowUnreachableException();
return default;
Vector128<byte> indicesLo = indices.GetLower();
Vector128<byte> lower = Vector128_.ShuffleNative(vector.GetLower(), indicesLo);
Vector128<byte> upper = Vector128_.ShuffleNative(vector.GetUpper(), indicesLo);
return Vector256.Create(lower, upper);
}
/// <summary>
@ -162,6 +137,54 @@ internal static class Vector256_
return (vm0 * vm1) - vs;
}
/// <summary>
/// Multiplies corresponding packed signed 16-bit integers of <paramref name="left"/> and
/// <paramref name="right"/> into intermediate signed 32-bit products, then horizontally adds each
/// adjacent pair of products and packs the sums.
/// </summary>
/// <param name="left">The first vector of packed signed 16-bit integers.</param>
/// <param name="right">The second vector of packed signed 16-bit integers.</param>
/// <returns>
/// A vector of packed signed 32-bit integers holding the sums of adjacent products.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<int> MultiplyAddAdjacent(Vector256<short> left, Vector256<short> right)
{
    if (!Avx2.IsSupported)
    {
        // No AVX2: run the 128-bit helper on each half and stitch the halves back together.
        Vector128<int> lower = Vector128_.MultiplyAddAdjacent(left.GetLower(), right.GetLower());
        Vector128<int> upper = Vector128_.MultiplyAddAdjacent(left.GetUpper(), right.GetUpper());
        return Vector256.Create(lower, upper);
    }

    return Avx2.MultiplyAddAdjacent(left, right);
}
/// <summary>
/// Packs signed 32-bit integers to unsigned 16-bit integers and saturates.
/// The pack is performed independently within each 128-bit lane: each lane of the result holds
/// the four packed elements of that lane of <paramref name="left"/> followed by the four packed
/// elements of that lane of <paramref name="right"/>.
/// </summary>
/// <param name="left">The left hand source vector.</param>
/// <param name="right">The right hand source vector.</param>
/// <returns>The <see cref="Vector256{UInt16}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<ushort> PackUnsignedSaturate(Vector256<int> left, Vector256<int> right)
{
    if (Avx2.IsSupported)
    {
        return Avx2.PackUnsignedSaturate(left, right);
    }

    // Saturate to the ushort range first so the narrowing below cannot wrap.
    Vector256<int> min = Vector256.Create((int)ushort.MinValue);
    Vector256<int> max = Vector256.Create((int)ushort.MaxValue);
    Vector256<uint> leftClamped = Clamp(left, min, max).AsUInt32();
    Vector256<uint> rightClamped = Clamp(right, min, max).AsUInt32();

    // Narrow lane by lane. A single Vector256.Narrow(left, right) would produce
    // (left.lo, left.hi, right.lo, right.hi), whereas the AVX2 instruction produces
    // (left.lo, right.lo, left.hi, right.hi); the fallback must use the same element order.
    return Vector256.Create(
        Vector128.Narrow(leftClamped.GetLower(), rightClamped.GetLower()),
        Vector128.Narrow(leftClamped.GetUpper(), rightClamped.GetUpper()));
}
/// <summary>
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
/// </summary>
@ -183,6 +206,27 @@ internal static class Vector256_
return Vector256.Narrow(lefClamped, rightClamped);
}
/// <summary>
/// Packs signed 16-bit integers to signed 8-bit integers and saturates.
/// The pack is performed independently within each 128-bit lane: each lane of the result holds
/// the eight packed elements of that lane of <paramref name="left"/> followed by the eight packed
/// elements of that lane of <paramref name="right"/>.
/// </summary>
/// <param name="left">The left hand source vector.</param>
/// <param name="right">The right hand source vector.</param>
/// <returns>The <see cref="Vector256{SByte}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<sbyte> PackSignedSaturate(Vector256<short> left, Vector256<short> right)
{
    if (Avx2.IsSupported)
    {
        return Avx2.PackSignedSaturate(left, right);
    }

    // Saturate to the sbyte range first so the narrowing below cannot wrap.
    Vector256<short> min = Vector256.Create((short)sbyte.MinValue);
    Vector256<short> max = Vector256.Create((short)sbyte.MaxValue);
    Vector256<short> leftClamped = Clamp(left, min, max);
    Vector256<short> rightClamped = Clamp(right, min, max);

    // Narrow lane by lane. A single Vector256.Narrow(left, right) would produce
    // (left.lo, left.hi, right.lo, right.hi), whereas the AVX2 instruction produces
    // (left.lo, right.lo, left.hi, right.hi); the fallback must use the same element order.
    return Vector256.Create(
        Vector128.Narrow(leftClamped.GetLower(), rightClamped.GetLower()),
        Vector128.Narrow(leftClamped.GetUpper(), rightClamped.GetUpper()));
}
/// <summary>
/// Restricts a vector between a minimum and a maximum value.
/// </summary>
@ -210,6 +254,211 @@ internal static class Vector256_
return Vector256.WidenLower(value.ToVector256());
}
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
/// <summary>
/// Multiplies the packed 16-bit integers of <paramref name="left"/> and <paramref name="right"/>
/// element by element and keeps the low 16 bits of each 32-bit product.
/// </summary>
/// <param name="left">The first vector of packed 16-bit integers.</param>
/// <param name="right">The second vector of packed 16-bit integers.</param>
/// <returns>
/// A vector containing the low 16 bits of each product of the corresponding elements of
/// <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<short> MultiplyLow(Vector256<short> left, Vector256<short> right)
{
    if (!Avx2.IsSupported)
    {
        // Widen each half to 32-bit so every lane can hold the full product.
        Vector256<int> lowerProducts = Vector256.WidenLower(left) * Vector256.WidenLower(right);
        Vector256<int> upperProducts = Vector256.WidenUpper(left) * Vector256.WidenUpper(right);

        // Narrowing truncates, which leaves exactly the low word of each product.
        return Vector256.Narrow(lowerProducts, upperProducts);
    }

    return Avx2.MultiplyLow(left, right);
}
/// <summary>
/// Multiplies the packed 16-bit integers of <paramref name="left"/> and <paramref name="right"/>
/// element by element and keeps the high 16 bits of each 32-bit product.
/// </summary>
/// <param name="left">The first vector of packed 16-bit integers.</param>
/// <param name="right">The second vector of packed 16-bit integers.</param>
/// <returns>
/// A vector containing the high 16 bits of each product of the corresponding elements of
/// <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<short> MultiplyHigh(Vector256<short> left, Vector256<short> right)
{
    if (!Avx2.IsSupported)
    {
        // Widen each half to 32-bit, multiply, then arithmetic-shift the high word down
        // into the low 16 bits so that the truncating narrow keeps it.
        Vector256<int> lowerHigh = (Vector256.WidenLower(left) * Vector256.WidenLower(right)) >> 16;
        Vector256<int> upperHigh = (Vector256.WidenUpper(left) * Vector256.WidenUpper(right)) >> 16;
        return Vector256.Narrow(lowerHigh, upperHigh);
    }

    return Avx2.MultiplyHigh(left, right);
}
/// <summary>
/// Unpacks and interleaves 32-bit integers from the low half of <paramref name="left"/> and
/// <paramref name="right"/>.
/// </summary>
/// <param name="left">The first vector of packed 32-bit integers.</param>
/// <param name="right">The second vector of packed 32-bit integers.</param>
/// <returns>
/// A vector containing the unpacked and interleaved 32-bit integers from the low halves of
/// <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<int> UnpackLow(Vector256<int> left, Vector256<int> right)
    => Avx2.IsSupported
        ? Avx2.UnpackLow(left, right)

        // No AVX2: apply the 128-bit helper to each half and recombine.
        : Vector256.Create(
            Vector128_.UnpackLow(left.GetLower(), right.GetLower()),
            Vector128_.UnpackLow(left.GetUpper(), right.GetUpper()));
/// <summary>
/// Unpacks and interleaves 8-bit integers from the high half of <paramref name="left"/> and
/// <paramref name="right"/>.
/// </summary>
/// <param name="left">The first vector of packed 8-bit integers.</param>
/// <param name="right">The second vector of packed 8-bit integers.</param>
/// <returns>
/// A vector containing the unpacked and interleaved 8-bit integers from the high halves of
/// <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> UnpackHigh(Vector256<byte> left, Vector256<byte> right)
    => Avx2.IsSupported
        ? Avx2.UnpackHigh(left, right)

        // No AVX2: apply the 128-bit helper to each half and recombine.
        : Vector256.Create(
            Vector128_.UnpackHigh(left.GetLower(), right.GetLower()),
            Vector128_.UnpackHigh(left.GetUpper(), right.GetUpper()));
/// <summary>
/// Unpacks and interleaves 8-bit integers from the low half of <paramref name="left"/> and
/// <paramref name="right"/>.
/// </summary>
/// <param name="left">The first vector of packed 8-bit integers.</param>
/// <param name="right">The second vector of packed 8-bit integers.</param>
/// <returns>
/// A vector containing the unpacked and interleaved 8-bit integers from the low halves of
/// <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> UnpackLow(Vector256<byte> left, Vector256<byte> right)
    => Avx2.IsSupported
        ? Avx2.UnpackLow(left, right)

        // No AVX2: apply the 128-bit helper to each half and recombine.
        : Vector256.Create(
            Vector128_.UnpackLow(left.GetLower(), right.GetLower()),
            Vector128_.UnpackLow(left.GetUpper(), right.GetUpper()));
/// <summary>
/// Subtracts the packed signed 16-bit integers in <paramref name="right"/> from the packed signed
/// 16-bit integers in <paramref name="left"/> using saturation.
/// </summary>
/// <param name="left">The vector of packed signed 16-bit integers to subtract from.</param>
/// <param name="right">The vector of packed signed 16-bit integers to subtract.</param>
/// <returns>
/// A vector containing the saturated differences of the packed signed 16-bit integers.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<short> SubtractSaturate(Vector256<short> left, Vector256<short> right)
{
    if (!Avx2.IsSupported)
    {
        // No AVX2: apply the 128-bit helper to each half and recombine.
        Vector128<short> lower = Vector128_.SubtractSaturate(left.GetLower(), right.GetLower());
        Vector128<short> upper = Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper());
        return Vector256.Create(lower, upper);
    }

    return Avx2.SubtractSaturate(left, right);
}
/// <summary>
/// Subtracts the packed unsigned 8-bit integers in <paramref name="right"/> from the packed
/// unsigned 8-bit integers in <paramref name="left"/> using saturation.
/// </summary>
/// <param name="left">The vector of packed unsigned 8-bit integers to subtract from.</param>
/// <param name="right">The vector of packed unsigned 8-bit integers to subtract.</param>
/// <returns>
/// A vector containing the saturated differences of the packed unsigned 8-bit integers.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> SubtractSaturate(Vector256<byte> left, Vector256<byte> right)
{
    if (!Avx2.IsSupported)
    {
        // No AVX2: apply the 128-bit helper to each half and recombine.
        Vector128<byte> lower = Vector128_.SubtractSaturate(left.GetLower(), right.GetLower());
        Vector128<byte> upper = Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper());
        return Vector256.Create(lower, upper);
    }

    return Avx2.SubtractSaturate(left, right);
}
}

96
src/ImageSharp/Common/Helpers/Vector512Utilities.cs

@ -1,7 +1,6 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers;
internal static class Vector512_
#pragma warning restore SA1649 // File name should match first type name
{
/// <summary>
/// Gets a value indicating whether shuffle float operations are supported.
/// </summary>
public static bool SupportsShuffleNativeFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx512F.IsSupported;
}
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
/// </summary>
public static bool SupportsShuffleNativeByte
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx512BW.IsSupported;
}
/// <summary>
/// Creates a new vector by selecting values from an input vector using the control.
/// </summary>
@ -47,15 +28,7 @@ internal static class Vector512_
/// <returns>The <see cref="Vector512{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> ShuffleNative(Vector512<float> vector, [ConstantExpected] byte control)
{
if (Avx512F.IsSupported)
{
return Avx512F.Shuffle(vector, vector, control);
}
ThrowUnreachableException();
return default;
}
=> Avx512F.Shuffle(vector, vector, control);
/// <summary>
/// Creates a new vector by selecting values from an input vector using a set of indices.
@ -73,8 +46,7 @@ internal static class Vector512_
return Avx512BW.Shuffle(vector, indices);
}
ThrowUnreachableException();
return default;
return Vector512.Shuffle(vector, indices);
}
/// <summary>
@ -85,25 +57,7 @@ internal static class Vector512_
/// <returns>The <see cref="Vector512{Int32}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<int> ConvertToInt32RoundToEven(Vector512<float> vector)
{
if (Avx512F.IsSupported)
{
return Avx512F.ConvertToVector512Int32(vector);
}
if (Avx.IsSupported)
{
Vector256<int> lower = Avx.ConvertToVector256Int32(vector.GetLower());
Vector256<int> upper = Avx.ConvertToVector256Int32(vector.GetUpper());
return Vector512.Create(lower, upper);
}
Vector512<float> sign = vector & Vector512.Create(-0.0f);
Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608.0f);
val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32;
return Vector512.ConvertToInt32(val_2p23_f32 | sign);
}
=> Avx512F.ConvertToVector512Int32(vector);
/// <summary>
/// Rounds all values in <paramref name="vector"/> to the nearest integer
@ -112,28 +66,11 @@ internal static class Vector512_
/// <param name="vector">The vector</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> RoundToNearestInteger(Vector512<float> vector)
{
if (Avx512F.IsSupported)
{
// imm8 = 0b1000:
// imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers)
// imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions)
return Avx512F.RoundScale(vector, 0b0000_1000);
}
if (Avx.IsSupported)
{
Vector256<float> lower = Avx.RoundToNearestInteger(vector.GetLower());
Vector256<float> upper = Avx.RoundToNearestInteger(vector.GetUpper());
return Vector512.Create(lower, upper);
}
Vector512<float> sign = vector & Vector512.Create(-0F);
Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608F);
val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32;
return val_2p23_f32 | sign;
}
// imm8 = 0b1000:
// imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers)
// imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions)
=> Avx512F.RoundScale(vector, 0b0000_1000);
/// <summary>
/// Performs a multiplication and an addition of the <see cref="Vector512{Single}"/>.
@ -148,21 +85,7 @@ internal static class Vector512_
Vector512<float> va,
Vector512<float> vm0,
Vector512<float> vm1)
{
if (Avx512F.IsSupported)
{
return Avx512F.FusedMultiplyAdd(vm0, vm1, va);
}
if (Fma.IsSupported)
{
Vector256<float> lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower());
Vector256<float> upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper());
return Vector512.Create(lower, upper);
}
return va + (vm0 * vm1);
}
=> Avx512F.FusedMultiplyAdd(vm0, vm1, va);
/// <summary>
/// Restricts a vector between a minimum and a maximum value.
@ -175,7 +98,4 @@ internal static class Vector512_
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<T> Clamp<T>(Vector512<T> value, Vector512<T> min, Vector512<T> max)
=> Vector512.Min(Vector512.Max(value, min), max);
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}

7
src/ImageSharp/Formats/Webp/AlphaDecoder.cs

@ -6,7 +6,6 @@ using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Formats.Webp.BitReader;
@ -314,7 +313,7 @@ internal class AlphaDecoder : IDisposable
private static void HorizontalUnfilter(Span<byte> prev, Span<byte> input, Span<byte> dst, int width)
{
if ((Sse2.IsSupported || AdvSimd.IsSupported) && width >= 9)
if (Vector128.IsHardwareAccelerated && width >= 9)
{
dst[0] = (byte)(input[0] + (prev.IsEmpty ? 0 : prev[0]));
nuint i;
@ -362,7 +361,7 @@ internal class AlphaDecoder : IDisposable
{
HorizontalUnfilter(null, input, dst, width);
}
else if (Avx2.IsSupported)
else if (Vector256.IsHardwareAccelerated)
{
ref byte inputRef = ref MemoryMarshal.GetReference(input);
ref byte prevRef = ref MemoryMarshal.GetReference(prev);
@ -374,7 +373,7 @@ internal class AlphaDecoder : IDisposable
{
Vector256<int> a0 = Unsafe.As<byte, Vector256<int>>(ref Unsafe.Add(ref inputRef, i));
Vector256<int> b0 = Unsafe.As<byte, Vector256<int>>(ref Unsafe.Add(ref prevRef, i));
Vector256<byte> c0 = Avx2.Add(a0.AsByte(), b0.AsByte());
Vector256<byte> c0 = a0.AsByte() + b0.AsByte();
ref byte outputRef = ref Unsafe.Add(ref dstRef, i);
Unsafe.As<byte, Vector256<byte>>(ref outputRef) = c0;
}

145
src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs

@ -4,7 +4,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless;
@ -12,17 +12,20 @@ internal static class ColorSpaceTransformUtils
{
public static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
{
if (Avx2.IsSupported && tileWidth >= 16)
if (Vector256.IsHardwareAccelerated && tileWidth >= 16)
{
const int span = 16;
Span<ushort> values = stackalloc ushort[span];
var collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
var collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
var collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
var collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
var collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
// These shuffle masks are safe for use with Avx2.Shuffle because all indices are within their respective 128-bit lanes (0–15 for the low mask, 16–31 for the high mask),
// and all disabled lanes are set to 0xFF to zero those bytes per the vpshufb specification. This guarantees lane-local shuffling with no cross-lane violations.
Vector256<byte> collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
Vector256<byte> collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
Vector256<byte> collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
Vector256<byte> collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector256<byte> collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
Vector256<short> multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
for (int y = 0; y < tileHeight; y++)
{
Span<uint> srcSpan = bgra[(y * stride)..];
@ -33,18 +36,18 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector256<byte> r0 = Avx2.Shuffle(input0, collectColorBlueTransformsShuffleLowMask256);
Vector256<byte> r1 = Avx2.Shuffle(input1, collectColorBlueTransformsShuffleHighMask256);
Vector256<byte> r = Avx2.Or(r0, r1);
Vector256<byte> gb0 = Avx2.And(input0, collectColorBlueTransformsGreenBlueMask256);
Vector256<byte> gb1 = Avx2.And(input1, collectColorBlueTransformsGreenBlueMask256);
Vector256<ushort> gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector256<byte> g = Avx2.And(gb.AsByte(), collectColorBlueTransformsGreenMask256);
Vector256<short> a = Avx2.MultiplyHigh(r.AsInt16(), multsr);
Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg);
Vector256<byte> c = Avx2.Subtract(gb.AsByte(), b.AsByte());
Vector256<byte> d = Avx2.Subtract(c, a.AsByte());
Vector256<byte> e = Avx2.And(d, collectColorBlueTransformsBlueMask256);
Vector256<byte> r0 = Vector256_.ShufflePerLane(input0, collectColorBlueTransformsShuffleLowMask256);
Vector256<byte> r1 = Vector256_.ShufflePerLane(input1, collectColorBlueTransformsShuffleHighMask256);
Vector256<byte> r = r0 | r1;
Vector256<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask256;
Vector256<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask256;
Vector256<ushort> gb = Vector256_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector256<byte> g = gb.AsByte() & collectColorBlueTransformsGreenMask256;
Vector256<short> a = Vector256_.MultiplyHigh(r.AsInt16(), multsr);
Vector256<short> b = Vector256_.MultiplyHigh(g.AsInt16(), multsg);
Vector256<byte> c = gb.AsByte() - b.AsByte();
Vector256<byte> d = c - a.AsByte();
Vector256<byte> e = d & collectColorBlueTransformsBlueMask256;
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = e.AsUInt16();
@ -59,20 +62,20 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
}
}
else if (Sse41.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
const int span = 8;
Span<ushort> values = stackalloc ushort[span];
var collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
var collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
var collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
var collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
var collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
Vector128<byte> collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
Vector128<byte> collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
Vector128<byte> collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
Vector128<byte> collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector128<byte> collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
Vector128<short> multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
for (int y = 0; y < tileHeight; y++)
{
Span<uint> srcSpan = bgra[(y * stride)..];
@ -83,18 +86,18 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector128<byte> r0 = Ssse3.Shuffle(input0, collectColorBlueTransformsShuffleLowMask);
Vector128<byte> r1 = Ssse3.Shuffle(input1, collectColorBlueTransformsShuffleHighMask);
Vector128<byte> r = Sse2.Or(r0, r1);
Vector128<byte> gb0 = Sse2.And(input0, collectColorBlueTransformsGreenBlueMask);
Vector128<byte> gb1 = Sse2.And(input1, collectColorBlueTransformsGreenBlueMask);
Vector128<ushort> gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector128<byte> g = Sse2.And(gb.AsByte(), collectColorBlueTransformsGreenMask);
Vector128<short> a = Sse2.MultiplyHigh(r.AsInt16(), multsr);
Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg);
Vector128<byte> c = Sse2.Subtract(gb.AsByte(), b.AsByte());
Vector128<byte> d = Sse2.Subtract(c, a.AsByte());
Vector128<byte> e = Sse2.And(d, collectColorBlueTransformsBlueMask);
Vector128<byte> r0 = Vector128_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask);
Vector128<byte> r1 = Vector128_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask);
Vector128<byte> r = r0 | r1;
Vector128<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask;
Vector128<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask;
Vector128<ushort> gb = Vector128_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector128<byte> g = gb.AsByte() & collectColorBlueTransformsGreenMask;
Vector128<short> a = Vector128_.MultiplyHigh(r.AsInt16(), multsr);
Vector128<short> b = Vector128_.MultiplyHigh(g.AsInt16(), multsg);
Vector128<byte> c = gb.AsByte() - b.AsByte();
Vector128<byte> d = c - a.AsByte();
Vector128<byte> e = d & collectColorBlueTransformsBlueMask;
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = e.AsUInt16();
@ -109,16 +112,16 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
}
}
else
{
CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
CollectColorBlueTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
}
}
private static void CollectColorBlueTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
private static void CollectColorBlueTransformsScalar(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
{
int pos = 0;
while (tileHeight-- > 0)
@ -135,11 +138,11 @@ internal static class ColorSpaceTransformUtils
public static void CollectColorRedTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
{
if (Avx2.IsSupported && tileWidth >= 16)
if (Vector256.IsHardwareAccelerated && tileWidth >= 16)
{
Vector256<byte> collectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte();
Vector256<byte> collectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte();
var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
const int span = 16;
Span<ushort> values = stackalloc ushort[span];
for (int y = 0; y < tileHeight; y++)
@ -152,15 +155,15 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector256<byte> g0 = Avx2.And(input0, collectColorRedTransformsGreenMask256); // 0 0 | g 0
Vector256<byte> g1 = Avx2.And(input1, collectColorRedTransformsGreenMask256);
Vector256<ushort> g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector256<int> a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector256<int> a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16);
Vector256<ushort> a = Avx2.PackUnsignedSaturate(a0, a1); // x r
Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector256<byte> c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r'
Vector256<byte> d = Avx2.And(c, collectColorRedTransformsAndMask256); // 0 r'
Vector256<byte> g0 = input0 & collectColorRedTransformsGreenMask256; // 0 0 | g 0
Vector256<byte> g1 = input1 & collectColorRedTransformsGreenMask256;
Vector256<ushort> g = Vector256_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector256<int> a0 = Vector256.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector256<int> a1 = Vector256.ShiftRightLogical(input1.AsInt32(), 16);
Vector256<ushort> a = Vector256_.PackUnsignedSaturate(a0, a1); // x r
Vector256<short> b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector256<byte> c = a.AsByte() - b.AsByte(); // x r'
Vector256<byte> d = c & collectColorRedTransformsAndMask256; // 0 r'
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = d.AsUInt16();
@ -175,14 +178,14 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
}
}
else if (Sse41.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> collectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
Vector128<byte> collectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
const int span = 8;
Span<ushort> values = stackalloc ushort[span];
for (int y = 0; y < tileHeight; y++)
@ -195,15 +198,15 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector128<byte> g0 = Sse2.And(input0, collectColorRedTransformsGreenMask); // 0 0 | g 0
Vector128<byte> g1 = Sse2.And(input1, collectColorRedTransformsGreenMask);
Vector128<ushort> g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector128<int> a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector128<int> a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16);
Vector128<ushort> a = Sse41.PackUnsignedSaturate(a0, a1); // x r
Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector128<byte> c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r'
Vector128<byte> d = Sse2.And(c, collectColorRedTransformsAndMask); // 0 r'
Vector128<byte> g0 = input0 & collectColorRedTransformsGreenMask; // 0 0 | g 0
Vector128<byte> g1 = input1 & collectColorRedTransformsGreenMask;
Vector128<ushort> g = Vector128_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector128<int> a0 = Vector128.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector128<int> a1 = Vector128.ShiftRightLogical(input1.AsInt32(), 16);
Vector128<ushort> a = Vector128_.PackUnsignedSaturate(a0, a1); // x r
Vector128<short> b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector128<byte> c = a.AsByte() - b.AsByte(); // x r'
Vector128<byte> d = c & collectColorRedTransformsAndMask; // 0 r'
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = d.AsUInt16();
@ -218,16 +221,16 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
}
}
else
{
CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
CollectColorRedTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
}
}
private static void CollectColorRedTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
private static void CollectColorRedTransformsScalar(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
{
int pos = 0;
while (tileHeight-- > 0)

196
src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs

@ -6,6 +6,7 @@ using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Memory;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless;
@ -94,17 +95,20 @@ internal static unsafe class LosslessUtils
/// <param name="pixelData">The pixel data to apply the transformation.</param>
public static void AddGreenToBlueAndRed(Span<uint> pixelData)
{
if (Avx2.IsSupported && pixelData.Length >= 8)
if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
{
Vector256<byte> addGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
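// The `255` entries zero the second and fourth byte of every pixel in the shuffled vector (the green and alpha positions), because the high bit (0x80) of the control byte is set.
// As a result only the first and third byte of each pixel (blue and red) receive a copy of that pixel's green byte (index 1, 5, 9, ...).
// Each index refers to a byte inside its own 128-bit lane (0–15 in the lower lane, 16–31 in the upper lane), so the mask is safe for a per-lane shuffle:
// the index bytes never have the high bit set and are always < 16 relative to the start of their lane, satisfying AVX2 lane rules.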
Vector256<byte> addGreenToBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
Vector256<byte> in0g0g = Avx2.Shuffle(input, addGreenToBlueAndRedMaskAvx2);
Vector256<byte> output = Avx2.Add(input, in0g0g);
Vector256<byte> in0g0g = Vector256_.ShufflePerLane(input, addGreenToBlueAndRedMask);
Vector256<byte> output = input + in0g0g;
Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
i += 8;
}
@ -115,39 +119,17 @@ internal static unsafe class LosslessUtils
AddGreenToBlueAndRedScalar(pixelData[(int)i..]);
}
}
else if (Ssse3.IsSupported && pixelData.Length >= 4)
{
Vector128<byte> addGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<byte> in0g0g = Ssse3.Shuffle(input, addGreenToBlueAndRedMaskSsse3);
Vector128<byte> output = Sse2.Add(input, in0g0g);
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
i += 4;
}
while (i <= numPixels - 4);
if (i != numPixels)
{
AddGreenToBlueAndRedScalar(pixelData[(int)i..]);
}
}
else if (Sse2.IsSupported && pixelData.Length >= 4)
else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
{
Vector128<byte> addGreenToBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
Vector128<ushort> b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200);
Vector128<ushort> c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g
Vector128<byte> output = Sse2.Add(input.AsByte(), c.AsByte());
Vector128<byte> in0g0g = Vector128_.ShuffleNative(input, addGreenToBlueAndRedMask);
Vector128<byte> output = input + in0g0g;
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
i += 4;
}
@ -180,17 +162,17 @@ internal static unsafe class LosslessUtils
public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
{
if (Avx2.IsSupported && pixelData.Length >= 8)
if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
{
Vector256<byte> subtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
Vector256<byte> subtractGreenFromBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
Vector256<byte> in0g0g = Avx2.Shuffle(input, subtractGreenFromBlueAndRedMaskAvx2);
Vector256<byte> output = Avx2.Subtract(input, in0g0g);
Vector256<byte> in0g0g = Vector256_.ShufflePerLane(input, subtractGreenFromBlueAndRedMask);
Vector256<byte> output = input - in0g0g;
Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
i += 8;
}
@ -201,39 +183,17 @@ internal static unsafe class LosslessUtils
SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]);
}
}
else if (Ssse3.IsSupported && pixelData.Length >= 4)
{
Vector128<byte> subtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<byte> in0g0g = Ssse3.Shuffle(input, subtractGreenFromBlueAndRedMaskSsse3);
Vector128<byte> output = Sse2.Subtract(input, in0g0g);
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
i += 4;
}
while (i <= numPixels - 4);
if (i != numPixels)
{
SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]);
}
}
else if (Sse2.IsSupported && pixelData.Length >= 4)
else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
{
Vector128<byte> subtractGreenFromBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
Vector128<ushort> b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200);
Vector128<ushort> c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g
Vector128<byte> output = Sse2.Subtract(input.AsByte(), c.AsByte());
Vector128<byte> in0g0g = Vector128_.ShuffleNative(input, subtractGreenFromBlueAndRedMask);
Vector128<byte> output = input - in0g0g;
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
i += 4;
}
@ -412,7 +372,7 @@ internal static unsafe class LosslessUtils
TransformColorScalar(m, pixelData[(int)idx..], numPixels - (int)idx);
}
}
else if (Sse2.IsSupported && numPixels >= 4)
else if (Vector128.IsHardwareAccelerated && numPixels >= 4)
{
Vector128<byte> transformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector128<byte> transformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
@ -423,16 +383,16 @@ internal static unsafe class LosslessUtils
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
Vector128<byte> a = Sse2.And(input.AsByte(), transformColorAlphaGreenMask);
Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector128<short> e = Sse2.ShiftLeftLogical(input.AsInt16(), 8);
Vector128<short> f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
Vector128<int> g = Sse2.ShiftRightLogical(f.AsInt32(), 16);
Vector128<byte> h = Sse2.Add(g.AsByte(), d.AsByte());
Vector128<byte> i = Sse2.And(h, transformColorRedBlueMask);
Vector128<byte> output = Sse2.Subtract(input.AsByte(), i);
Vector128<byte> a = input.AsByte() & transformColorAlphaGreenMask;
Vector128<short> b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector128<short> e = Vector128_.ShiftLeftLogical(input.AsInt16(), 8);
Vector128<short> f = Vector128_.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
Vector128<int> g = Vector128.ShiftRightLogical(f.AsInt32(), 16);
Vector128<byte> h = g.AsByte() + d.AsByte();
Vector128<byte> i = h & transformColorRedBlueMask;
Vector128<byte> output = input.AsByte() - i;
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
idx += 4;
}
@ -503,7 +463,7 @@ internal static unsafe class LosslessUtils
TransformColorInverseScalar(m, pixelData[(int)idx..]);
}
}
else if (Sse2.IsSupported && pixelData.Length >= 4)
else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
{
Vector128<byte> transformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
@ -514,17 +474,17 @@ internal static unsafe class LosslessUtils
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
Vector128<byte> a = Sse2.And(input.AsByte(), transformColorInverseAlphaGreenMask);
Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector128<byte> e = Sse2.Add(input.AsByte(), d.AsByte());
Vector128<short> f = Sse2.ShiftLeftLogical(e.AsInt16(), 8);
Vector128<short> g = Sse2.MultiplyHigh(f, multsb2.AsInt16());
Vector128<int> h = Sse2.ShiftRightLogical(g.AsInt32(), 8);
Vector128<byte> i = Sse2.Add(h.AsByte(), f.AsByte());
Vector128<short> j = Sse2.ShiftRightLogical(i.AsInt16(), 8);
Vector128<byte> output = Sse2.Or(j.AsByte(), a);
Vector128<byte> a = input.AsByte() & transformColorInverseAlphaGreenMask;
Vector128<short> b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector128<byte> e = input.AsByte() + d.AsByte();
Vector128<short> f = Vector128_.ShiftLeftLogical(e.AsInt16(), 8);
Vector128<short> g = Vector128_.MultiplyHigh(f, multsb2.AsInt16());
Vector128<int> h = Vector128.ShiftRightLogical(g.AsInt32(), 8);
Vector128<byte> i = h.AsByte() + f.AsByte();
Vector128<short> j = Vector128.ShiftRightLogical(i.AsInt16(), 8);
Vector128<byte> output = j.AsByte() | a;
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
}
@ -1401,15 +1361,15 @@ internal static unsafe class LosslessUtils
private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
Vector128<short> v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16());
Vector128<short> v2 = Sse2.Subtract(v1, c2Vec.AsInt16());
Vector128<byte> b = Sse2.PackUnsignedSaturate(v2, v2);
return Sse2.ConvertToUInt32(b.AsUInt32());
Vector128<byte> c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c2Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128<byte>.Zero);
Vector128<short> v1 = c0Vec.AsInt16() + c1Vec.AsInt16();
Vector128<short> v2 = v1 - c2Vec.AsInt16();
Vector128<byte> b = Vector128_.PackUnsignedSaturate(v2, v2);
return b.AsUInt32().ToScalar();
}
{
@ -1432,20 +1392,20 @@ internal static unsafe class LosslessUtils
private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
Vector128<byte> b0 = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
Vector128<short> avg = Sse2.Add(c1Vec.AsInt16(), c0Vec.AsInt16());
Vector128<short> a0 = Sse2.ShiftRightLogical(avg, 1);
Vector128<short> a1 = Sse2.Subtract(a0, b0.AsInt16());
Vector128<short> bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16());
Vector128<short> a2 = Sse2.Subtract(a1, bgta);
Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2, 1);
Vector128<short> a4 = Sse2.Add(a0, a3).AsInt16();
Vector128<byte> a5 = Sse2.PackUnsignedSaturate(a4, a4);
return Sse2.ConvertToUInt32(a5.AsUInt32());
Vector128<byte> c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128<byte>.Zero);
Vector128<byte> b0 = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128<byte>.Zero);
Vector128<short> avg = c1Vec.AsInt16() + c0Vec.AsInt16();
Vector128<short> a0 = Vector128.ShiftRightLogical(avg, 1);
Vector128<short> a1 = a0 - b0.AsInt16();
Vector128<short> bgta = Vector128.GreaterThan(b0.AsInt16(), a0.AsInt16());
Vector128<short> a2 = a1 - bgta;
Vector128<short> a3 = Vector128.ShiftRightArithmetic(a2, 1);
Vector128<short> a4 = (a0 + a3).AsInt16();
Vector128<byte> a5 = Vector128_.PackUnsignedSaturate(a4, a4);
return a5.AsUInt32().ToScalar();
}
{
@ -1475,23 +1435,23 @@ internal static unsafe class LosslessUtils
private static uint Select(uint a, uint b, uint c, Span<short> scratch)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
fixed (short* ptr = &MemoryMarshal.GetReference(scratch))
{
Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte();
Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte();
Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0);
Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0);
Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0);
Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
Vector128<byte> ac = Sse2.Or(ac0, ca0);
Vector128<byte> bc = Sse2.Or(bc0, cb0);
Vector128<byte> pa = Sse2.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
Sse2.Store((ushort*)ptr, diff);
Vector128<byte> a0 = Vector128.CreateScalar(a).AsByte();
Vector128<byte> b0 = Vector128.CreateScalar(b).AsByte();
Vector128<byte> c0 = Vector128.CreateScalar(c).AsByte();
Vector128<byte> ac0 = Vector128_.SubtractSaturate(a0, c0);
Vector128<byte> ca0 = Vector128_.SubtractSaturate(c0, a0);
Vector128<byte> bc0 = Vector128_.SubtractSaturate(b0, c0);
Vector128<byte> cb0 = Vector128_.SubtractSaturate(c0, b0);
Vector128<byte> ac = ac0 | ca0;
Vector128<byte> bc = bc0 | cb0;
Vector128<byte> pa = Vector128_.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
Vector128<byte> pb = Vector128_.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
Vector128<ushort> diff = pb.AsUInt16() - pa.AsUInt16();
diff.Store((ushort*)ptr);
int paMinusPb = ptr[3] + ptr[2] + ptr[1] + ptr[0];
return (paMinusPb <= 0) ? a : b;
}

1273
src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

File diff suppressed because it is too large

355
src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs

@ -2,10 +2,11 @@
// Licensed under the Six Labors Split License.
using System.Buffers.Binary;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Webp.Lossy;
@ -78,7 +79,7 @@ internal static unsafe class Vp8Encoding
// Does two inverse transforms.
public static void ITransformTwo(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
@ -116,10 +117,10 @@ internal static unsafe class Vp8Encoding
Vector128<long> inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
Vector128<long> inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);
in0 = Sse2.UnpackLow(in0, inb0);
in1 = Sse2.UnpackLow(in1, inb1);
in2 = Sse2.UnpackLow(in2, inb2);
in3 = Sse2.UnpackLow(in3, inb3);
in0 = Vector128_.UnpackLow(in0, inb0);
in1 = Vector128_.UnpackLow(in1, inb1);
in2 = Vector128_.UnpackLow(in2, inb2);
in3 = Vector128_.UnpackLow(in3, inb3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
@ -128,49 +129,45 @@ internal static unsafe class Vp8Encoding
// Vertical pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
// Horizontal pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
// Add inverse transform to 'ref' and store.
// Load the reference(s).
Vector128<byte> ref0 = Vector128<byte>.Zero;
Vector128<byte> ref1 = Vector128<byte>.Zero;
Vector128<byte> ref2 = Vector128<byte>.Zero;
Vector128<byte> ref3 = Vector128<byte>.Zero;
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
// Load eight bytes/pixels per line.
ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
Vector128<byte> ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
Vector128<byte> ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
Vector128<byte> ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
Vector128<byte> ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
// Convert to 16b.
ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
ref0 = Vector128_.UnpackLow(ref0, Vector128<byte>.Zero);
ref1 = Vector128_.UnpackLow(ref1, Vector128<byte>.Zero);
ref2 = Vector128_.UnpackLow(ref2, Vector128<byte>.Zero);
ref3 = Vector128_.UnpackLow(ref3, Vector128<byte>.Zero);
// Add the inverse transform(s).
Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
Vector128<short> ref0InvAdded = ref0.AsInt16() + t0.AsInt16();
Vector128<short> ref1InvAdded = ref1.AsInt16() + t1.AsInt16();
Vector128<short> ref2InvAdded = ref2.AsInt16() + t2.AsInt16();
Vector128<short> ref3InvAdded = ref3.AsInt16() + t3.AsInt16();
// Unsigned saturate to 8b.
ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
// Store eight bytes/pixels per line.
ref byte outputRef = ref MemoryMarshal.GetReference(dst);
@ -188,7 +185,7 @@ internal static unsafe class Vp8Encoding
public static void ITransformOne(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
// Load and concatenate the transform coefficients (we'll do two inverse
// transforms in parallel). In the case of only one inverse transform, the
@ -207,63 +204,59 @@ internal static unsafe class Vp8Encoding
// Vertical pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
// Horizontal pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
// Add inverse transform to 'ref' and store.
// Load the reference(s).
Vector128<byte> ref0 = Vector128<byte>.Zero;
Vector128<byte> ref1 = Vector128<byte>.Zero;
Vector128<byte> ref2 = Vector128<byte>.Zero;
Vector128<byte> ref3 = Vector128<byte>.Zero;
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
// Load four bytes/pixels per line.
ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
Vector128<byte> ref0 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref referenceRef)).AsByte();
Vector128<byte> ref1 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
Vector128<byte> ref2 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
Vector128<byte> ref3 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
// Convert to 16b.
ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
ref0 = Vector128_.UnpackLow(ref0, Vector128<byte>.Zero);
ref1 = Vector128_.UnpackLow(ref1, Vector128<byte>.Zero);
ref2 = Vector128_.UnpackLow(ref2, Vector128<byte>.Zero);
ref3 = Vector128_.UnpackLow(ref3, Vector128<byte>.Zero);
// Add the inverse transform(s).
Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
Vector128<short> ref0InvAdded = ref0.AsInt16() + t0.AsInt16();
Vector128<short> ref1InvAdded = ref1.AsInt16() + t1.AsInt16();
Vector128<short> ref2InvAdded = ref2.AsInt16() + t2.AsInt16();
Vector128<short> ref3InvAdded = ref3.AsInt16() + t3.AsInt16();
// Unsigned saturate to 8b.
ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
// Get a reference to the start of the destination buffer.
ref byte outputRef = ref MemoryMarshal.GetReference(dst);
// Store four bytes/pixels per line.
int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
Unsafe.As<byte, int>(ref outputRef) = output0;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
int output0 = ref0.AsInt32().ToScalar();
int output1 = ref1.AsInt32().ToScalar();
int output2 = ref2.AsInt32().ToScalar();
int output3 = ref3.AsInt32().ToScalar();
Unsafe.WriteUnaligned(ref outputRef, output0);
Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps), output1);
Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2), output2);
Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3), output3);
}
else
{
@ -302,72 +295,72 @@ internal static unsafe class Vp8Encoding
}
}
// Vertical 1-D pass of the inverse transform, evaluated independently on every 16-bit lane.
// NOTE(review): this listing is a rendered diff - each removed Sse2.* line (including the old
// signature) is immediately followed by its cross-platform Vector128 replacement.
// Parameters are ordered in0, in2, in1, in3 (even inputs first), matching the call sites.
// Outputs: tmp0 = a + d, tmp1 = b + c, tmp2 = b - c, tmp3 = a - d.
private static void InverseTransformVerticalPass(Vector128<long> in0, Vector128<long> in2, Vector128<long> in1, Vector128<long> in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3)
private static void InverseTransformVerticalPassVector128(Vector128<long> in0, Vector128<long> in2, Vector128<long> in1, Vector128<long> in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3)
{
// Even part: a = in0 + in2, b = in0 - in2 (inputs reinterpreted as 16-bit lanes).
Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
Vector128<short> a = in0.AsInt16() + in2.AsInt16();
Vector128<short> b = in0.AsInt16() - in2.AsInt16();
// 16-bit multiplier constants (lower-case k1/k2 in the formulas below) fed to MultiplyHigh.
Vector128<short> k1 = Vector128.Create((short)20091).AsInt16();
Vector128<short> k2 = Vector128.Create((short)-30068).AsInt16();
// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
// NOTE(review): Vector128_.MultiplyHigh is a project helper not shown here; presumably it
// mirrors Sse2.MultiplyHigh (upper 16 bits of the signed 16x16 product) - confirm.
// c3 supplies the "+ in1 - in3" correction term, c4 the difference of the two high products.
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2);
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1);
Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
Vector128<short> c4 = Sse2.Subtract(c1, c2);
Vector128<short> c = Sse2.Add(c3, c4);
Vector128<short> c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2);
Vector128<short> c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1);
Vector128<short> c3 = in1.AsInt16() - in3.AsInt16();
Vector128<short> c4 = c1 - c2;
Vector128<short> c = c3 + c4;
// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
// d3 supplies the "+ in1 + in3" correction term, d4 the sum of the two high products.
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1);
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2);
Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
Vector128<short> d4 = Sse2.Add(d1, d2);
Vector128<short> d = Sse2.Add(d3, d4);
Vector128<short> d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1);
Vector128<short> d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2);
Vector128<short> d3 = in1.AsInt16() + in3.AsInt16();
Vector128<short> d4 = d1 + d2;
Vector128<short> d = d3 + d4;
// Second pass.
// Butterfly: combine the even part (a, b) with the odd part (c, d).
tmp0 = Sse2.Add(a, d);
tmp1 = Sse2.Add(b, c);
tmp2 = Sse2.Subtract(b, c);
tmp3 = Sse2.Subtract(a, d);
tmp0 = a + d;
tmp1 = b + c;
tmp2 = b - c;
tmp3 = a - d;
}
// Horizontal 1-D pass of the inverse transform, evaluated independently on every 16-bit lane.
// Same butterfly as the vertical pass, plus a +4 bias on t0 and a final arithmetic shift right by 3.
// NOTE(review): this listing is a rendered diff - each removed Sse2.* line (including the old
// signature) is immediately followed by its cross-platform Vector128 replacement.
// Parameters are ordered t0, t2, t1, t3 (even inputs first), matching the call sites.
// Outputs: shifted0..shifted3 = (a + d, b + c, b - c, a - d) >> 3.
private static void InverseTransformHorizontalPass(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
private static void InverseTransformHorizontalPassVector128(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
{
// dc carries a +4 bias; since dc feeds both a and b, every output receives it once, so the
// final ">> 3" rounds to nearest instead of truncating.
Vector128<short> dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4));
Vector128<short> a = Sse2.Add(dc, t2.AsInt16());
Vector128<short> b = Sse2.Subtract(dc, t2.AsInt16());
Vector128<short> dc = t0.AsInt16() + Vector128.Create((short)4);
Vector128<short> a = dc + t2.AsInt16();
Vector128<short> b = dc - t2.AsInt16();
// 16-bit multiplier constants (lower-case k1/k2 in the formulas below) fed to MultiplyHigh.
Vector128<short> k1 = Vector128.Create((short)20091).AsInt16();
Vector128<short> k2 = Vector128.Create((short)-30068).AsInt16();
// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
// NOTE(review): Vector128_.MultiplyHigh is a project helper not shown here; presumably it
// mirrors Sse2.MultiplyHigh (upper 16 bits of the signed 16x16 product) - confirm.
Vector128<short> c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2);
Vector128<short> c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1);
Vector128<short> c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
Vector128<short> c4 = Sse2.Subtract(c1, c2);
Vector128<short> c = Sse2.Add(c3, c4);
Vector128<short> c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2);
Vector128<short> c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1);
Vector128<short> c3 = t1.AsInt16() - t3.AsInt16();
Vector128<short> c4 = c1 - c2;
Vector128<short> c = c3 + c4;
// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
Vector128<short> d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1);
Vector128<short> d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2);
Vector128<short> d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
Vector128<short> d4 = Sse2.Add(d1, d2);
Vector128<short> d = Sse2.Add(d3, d4);
Vector128<short> d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1);
Vector128<short> d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2);
Vector128<short> d3 = t1.AsInt16() + t3.AsInt16();
Vector128<short> d4 = d1 + d2;
Vector128<short> d = d3 + d4;
// Second pass.
// Butterfly, then scale each result down by 8 with a sign-preserving shift.
Vector128<short> tmp0 = Sse2.Add(a, d);
Vector128<short> tmp1 = Sse2.Add(b, c);
Vector128<short> tmp2 = Sse2.Subtract(b, c);
Vector128<short> tmp3 = Sse2.Subtract(a, d);
shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
Vector128<short> tmp0 = a + d;
Vector128<short> tmp1 = b + c;
Vector128<short> tmp2 = b - c;
Vector128<short> tmp3 = a - d;
shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3);
shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3);
shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3);
shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3);
}
public static void FTransform2(Span<byte> src, Span<byte> reference, Span<short> output, Span<short> output2, Span<int> scratch)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
ref byte srcRef = ref MemoryMarshal.GetReference(src);
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
@ -385,38 +378,38 @@ internal static unsafe class Vp8Encoding
Vector128<long> ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0);
// Convert both to 16 bit.
Vector128<byte> srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow0 = Vector128_.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow1 = Vector128_.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow2 = Vector128_.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow3 = Vector128_.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow0 = Vector128_.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow1 = Vector128_.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow2 = Vector128_.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow3 = Vector128_.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);
// Compute difference. -> 00 01 02 03 00' 01' 02' 03'
Vector128<short> diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16());
Vector128<short> diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16());
Vector128<short> diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16());
Vector128<short> diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16());
Vector128<short> diff0 = srcLow0.AsInt16() - refLow0.AsInt16();
Vector128<short> diff1 = srcLow1.AsInt16() - refLow1.AsInt16();
Vector128<short> diff2 = srcLow2.AsInt16() - refLow2.AsInt16();
Vector128<short> diff3 = srcLow3.AsInt16() - refLow3.AsInt16();
// Unpack and shuffle.
// 00 01 02 03 0 0 0 0
// 10 11 12 13 0 0 0 0
// 20 21 22 23 0 0 0 0
// 30 31 32 33 0 0 0 0
Vector128<int> shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
Vector128<int> shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
Vector128<int> shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
Vector128<int> shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());
Vector128<int> shuf01l = Vector128_.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
Vector128<int> shuf23l = Vector128_.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
Vector128<int> shuf01h = Vector128_.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
Vector128<int> shuf23h = Vector128_.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());
// First pass.
FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);
FTransformPass1Vector128(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
FTransformPass1Vector128(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);
// Second pass.
FTransformPass2SSE2(v01l, v32l, output);
FTransformPass2SSE2(v01h, v32h, output2);
FTransformPass2Vector128(v01l, v32l, output);
FTransformPass2Vector128(v01h, v32h, output2);
}
else
{
@ -427,7 +420,7 @@ internal static unsafe class Vp8Encoding
public static void FTransform(Span<byte> src, Span<byte> reference, Span<short> output, Span<int> scratch)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
ref byte srcRef = ref MemoryMarshal.GetReference(src);
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
@ -449,29 +442,29 @@ internal static unsafe class Vp8Encoding
// 20 21 22 23 *
// 30 31 32 33 *
// Shuffle.
Vector128<short> srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16());
Vector128<short> srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16());
Vector128<short> refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
Vector128<short> refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16());
Vector128<short> srcLow0 = Vector128_.UnpackLow(src0.AsInt16(), src1.AsInt16());
Vector128<short> srcLow1 = Vector128_.UnpackLow(src2.AsInt16(), src3.AsInt16());
Vector128<short> refLow0 = Vector128_.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
Vector128<short> refLow1 = Vector128_.UnpackLow(ref2.AsInt16(), ref3.AsInt16());
// 00 01 10 11 02 03 12 13 * * ...
// 20 21 30 31 22 23 32 33 * * ...
// Convert both to 16 bit.
Vector128<byte> src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> src0_16b = Vector128_.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> src1_16b = Vector128_.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> ref0_16b = Vector128_.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> ref1_16b = Vector128_.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);
// Compute the difference.
Vector128<short> row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16());
Vector128<short> row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16());
Vector128<short> row01 = src0_16b.AsInt16() - ref0_16b.AsInt16();
Vector128<short> row23 = src1_16b.AsInt16() - ref1_16b.AsInt16();
// First pass.
FTransformPass1SSE2(row01, row23, out Vector128<int> v01, out Vector128<int> v32);
FTransformPass1Vector128(row01, row23, out Vector128<int> v01, out Vector128<int> v32);
// Second pass.
FTransformPass2SSE2(v01, v32, output);
FTransformPass2Vector128(v01, v32, output);
}
else
{
@ -517,88 +510,88 @@ internal static unsafe class Vp8Encoding
}
}
// First pass of the forward transform on two packed rows of 16-bit differences.
// NOTE(review): this listing is a rendered diff - each removed Sse2.* line (including the old
// signature) is immediately followed by its cross-platform Vector128 replacement.
// Inputs:  row01 / row23, laid out as described by the "*in01" / "*in23" comments below.
// Outputs: out01 and out32, 32-bit views of the interleaved 16-bit results, consumed by pass 2.
// The multiplier vectors are written as 16 byte values; read as little-endian Int16 lanes the
// byte pairs decode to: (8, 0) = 8, (248, 255) = -8, (232, 20) = 5352, (169, 8) = 2217,
// (24, 235) = -5352.
public static void FTransformPass1SSE2(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32)
public static void FTransformPass1Vector128(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32)
{
// *in01 = 00 01 10 11 02 03 12 13
// *in23 = 20 21 30 31 22 23 32 33
// Swap the 16-bit pairs in the upper half so columns 2/3 line up against columns 1/0.
Vector128<short> shuf01_p = Sse2.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301);
Vector128<short> shuf32_p = Sse2.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301);
Vector128<short> shuf01_p = Vector128_.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301);
Vector128<short> shuf32_p = Vector128_.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301);
// 00 01 10 11 03 02 13 12
// 20 21 30 31 23 22 33 32
// Gather the low 64-bit halves into s01 and the high 64-bit halves into s32.
Vector128<long> s01 = Sse2.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64());
Vector128<long> s32 = Sse2.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64());
Vector128<long> s01 = Vector128_.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64());
Vector128<long> s32 = Vector128_.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64());
// 00 01 10 11 20 21 30 31
// 03 02 13 12 23 22 33 32
Vector128<short> a01 = Sse2.Add(s01.AsInt16(), s32.AsInt16());
Vector128<short> a32 = Sse2.Subtract(s01.AsInt16(), s32.AsInt16());
Vector128<short> a01 = s01.AsInt16() + s32.AsInt16();
Vector128<short> a32 = s01.AsInt16() - s32.AsInt16();
// [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
// [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
// NOTE(review): Vector128_.MultiplyAddAdjacent is a project helper not shown here; presumably
// it mirrors Sse2.MultiplyAddAdjacent (multiply 16-bit lanes, sum adjacent pairs to 32 bits) - confirm.
// [ (a0 + a1) << 3, ... ]
Vector128<int> tmp0 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p
Vector128<int> tmp0 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p
// [ (a0 - a1) << 3, ... ]
Vector128<int> tmp2 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16()); // K88m
Vector128<int> tmp11 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16()); // K5352_2217p
Vector128<int> tmp31 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16()); // K5352_2217m
Vector128<int> tmp12 = Sse2.Add(tmp11, Vector128.Create(1812));
Vector128<int> tmp32 = Sse2.Add(tmp31, Vector128.Create(937));
Vector128<int> tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9);
Vector128<int> tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9);
Vector128<short> s03 = Sse2.PackSignedSaturate(tmp0, tmp2);
Vector128<short> s12 = Sse2.PackSignedSaturate(tmp1, tmp3);
Vector128<short> slo = Sse2.UnpackLow(s03, s12); // 0 1 0 1 0 1...
Vector128<short> shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3
Vector128<int> v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32());
out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32());
out32 = Sse2.Shuffle(v23, SimdUtils.Shuffle.MMShuffle1032);
Vector128<int> tmp2 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16()); // K88m
// tmp1 = (a3 * 5352 + a2 * 2217 + 1812) >> 9
// tmp3 = (a3 * 2217 - a2 * 5352 + 937) >> 9
Vector128<int> tmp11 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16()); // K5352_2217p
Vector128<int> tmp31 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16()); // K5352_2217m
Vector128<int> tmp12 = tmp11 + Vector128.Create(1812);
Vector128<int> tmp32 = tmp31 + Vector128.Create(937);
Vector128<int> tmp1 = Vector128.ShiftRightArithmetic(tmp12, 9);
Vector128<int> tmp3 = Vector128.ShiftRightArithmetic(tmp32, 9);
// Narrow the 32-bit results back to 16 bits with signed saturation, then interleave them.
Vector128<short> s03 = Vector128_.PackSignedSaturate(tmp0, tmp2);
Vector128<short> s12 = Vector128_.PackSignedSaturate(tmp1, tmp3);
Vector128<short> slo = Vector128_.UnpackLow(s03, s12); // 0 1 0 1 0 1...
Vector128<short> shi = Vector128_.UnpackHigh(s03, s12); // 2 3 2 3 2 3
Vector128<int> v23 = Vector128_.UnpackHigh(slo.AsInt32(), shi.AsInt32());
out01 = Vector128_.UnpackLow(slo.AsInt32(), shi.AsInt32());
// NOTE(review): MMShuffle1032 is defined elsewhere; by its name it presumably swaps the two
// 64-bit halves, turning the "2 3" ordering of v23 into "3 2" - confirm.
out32 = Vector128_.ShuffleNative(v23, SimdUtils.Shuffle.MMShuffle1032);
}
public static void FTransformPass2SSE2(Vector128<int> v01, Vector128<int> v32, Span<short> output)
public static void FTransformPass2Vector128(Vector128<int> v01, Vector128<int> v32, Span<short> output)
{
// Same operations are done on the (0,3) and (1,2) pairs.
// a3 = v0 - v3
// a2 = v1 - v2
Vector128<short> a32 = Sse2.Subtract(v01.AsInt16(), v32.AsInt16());
Vector128<long> a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64());
Vector128<short> a32 = v01.AsInt16() - v32.AsInt16();
Vector128<long> a22 = Vector128_.UnpackHigh(a32.AsInt64(), a32.AsInt64());
Vector128<short> b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16());
Vector128<int> c1 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16()); // K5352_2217
Vector128<int> c3 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16()); // K2217_5352
Vector128<int> d1 = Sse2.Add(c1, Vector128.Create(12000 + (1 << 16))); // K12000PlusOne
Vector128<int> d3 = Sse2.Add(c3, Vector128.Create(51000));
Vector128<int> e1 = Sse2.ShiftRightArithmetic(d1, 16);
Vector128<int> e3 = Sse2.ShiftRightArithmetic(d3, 16);
Vector128<short> b23 = Vector128_.UnpackLow(a22.AsInt16(), a32.AsInt16());
Vector128<int> c1 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16()); // K5352_2217
Vector128<int> c3 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16()); // K2217_5352
Vector128<int> d1 = c1 + Vector128.Create(12000 + (1 << 16)); // K12000PlusOne
Vector128<int> d3 = c3 + Vector128.Create(51000);
Vector128<int> e1 = Vector128.ShiftRightArithmetic(d1, 16);
Vector128<int> e3 = Vector128.ShiftRightArithmetic(d3, 16);
// f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
// f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
Vector128<short> f1 = Sse2.PackSignedSaturate(e1, e1);
Vector128<short> f3 = Sse2.PackSignedSaturate(e3, e3);
Vector128<short> f1 = Vector128_.PackSignedSaturate(e1, e1);
Vector128<short> f3 = Vector128_.PackSignedSaturate(e3, e3);
// g1 = f1 + (a3 != 0);
// The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
// desired (0, 1), we add one earlier through k12000_plus_one.
// -> g1 = f1 + 1 - (a3 == 0)
Vector128<short> g1 = Sse2.Add(f1, Sse2.CompareEqual(a32, Vector128<short>.Zero));
Vector128<short> g1 = f1 + Vector128.Equals(a32, Vector128<short>.Zero);
// a0 = v0 + v3
// a1 = v1 + v2
Vector128<short> a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16());
Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), Vector128.Create((short)7));
Vector128<short> a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
Vector128<short> c0 = Sse2.Add(a01Plus7, a11);
Vector128<short> c2 = Sse2.Subtract(a01Plus7, a11);
Vector128<short> a01 = v01.AsInt16() + v32.AsInt16();
Vector128<short> a01Plus7 = a01.AsInt16() + Vector128.Create((short)7);
Vector128<short> a11 = Vector128_.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
Vector128<short> c0 = a01Plus7 + a11;
Vector128<short> c2 = a01Plus7 - a11;
// d0 = (a0 + a1 + 7) >> 4;
// d2 = (a0 - a1 + 7) >> 4;
Vector128<short> d0 = Sse2.ShiftRightArithmetic(c0, 4);
Vector128<short> d2 = Sse2.ShiftRightArithmetic(c2, 4);
Vector128<short> d0 = Vector128.ShiftRightArithmetic(c0, 4);
Vector128<short> d2 = Vector128.ShiftRightArithmetic(c2, 4);
Vector128<long> d0g1 = Sse2.UnpackLow(d0.AsInt64(), g1.AsInt64());
Vector128<long> d2f3 = Sse2.UnpackLow(d2.AsInt64(), f3.AsInt64());
Vector128<long> d0g1 = Vector128_.UnpackLow(d0.AsInt64(), g1.AsInt64());
Vector128<long> d2f3 = Vector128_.UnpackLow(d2.AsInt64(), f3.AsInt64());
ref short outputRef = ref MemoryMarshal.GetReference(output);
Unsafe.As<short, Vector128<short>>(ref outputRef) = d0g1.AsInt16();

209
src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs

@ -5,7 +5,7 @@ using System.Buffers;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
@ -29,9 +29,9 @@ internal static class YuvConversion
// ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16
public static void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
{
if (Sse41.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
UpSampleVector128(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
}
else
{
@ -107,7 +107,7 @@ internal static class YuvConversion
//
// Then m can be written as
// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
private static void UpSampleSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
private static void UpSampleVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
{
const int xStep = 3;
Array.Clear(uvBuffer);
@ -138,18 +138,18 @@ internal static class YuvConversion
{
for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
{
UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
ConvertYuvToBgrWithBottomYVector128(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
}
}
else
{
for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
{
UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep);
UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
ConvertYuvToBgrVector128(topY, topDst, ru, rv, pos, xStep);
}
}
@ -161,18 +161,18 @@ internal static class YuvConversion
Span<byte> tmpBottomDst = tmpTopDst[(4 * 32)..];
Span<byte> tmpTop = tmpBottomDst[(4 * 32)..];
Span<byte> tmpBottom = bottomY.IsEmpty ? null : tmpTop[32..];
UpSampleLastBlock(topU[uvPos..], curU[uvPos..], leftOver, ru);
UpSampleLastBlock(topV[uvPos..], curV[uvPos..], leftOver, rv);
UpSampleLastBlockVector128(topU[uvPos..], curU[uvPos..], leftOver, ru);
UpSampleLastBlockVector128(topV[uvPos..], curV[uvPos..], leftOver, rv);
topY[pos..len].CopyTo(tmpTop);
if (!bottomY.IsEmpty)
{
bottomY[pos..len].CopyTo(tmpBottom);
ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
ConvertYuvToBgrWithBottomYVector128(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
}
else
{
ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep);
ConvertYuvToBgrVector128(tmpTop, tmpTopDst, ru, rv, 0, xStep);
}
tmpTopDst[..((len - pos) * xStep)].CopyTo(topDst[(pos * xStep)..]);
@ -184,7 +184,7 @@ internal static class YuvConversion
}
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
private static void UpSample32Pixels(ref byte r1, ref byte r2, Span<byte> output)
private static void UpSample32PixelsVector128(ref byte r1, ref byte r2, Span<byte> output)
{
// Load inputs.
Vector128<byte> a = Unsafe.As<byte, Vector128<byte>>(ref r1);
@ -192,28 +192,28 @@ internal static class YuvConversion
Vector128<byte> c = Unsafe.As<byte, Vector128<byte>>(ref r2);
Vector128<byte> d = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref r2, 1));
Vector128<byte> s = Sse2.Average(a, d); // s = (a + d + 1) / 2
Vector128<byte> t = Sse2.Average(b, c); // t = (b + c + 1) / 2
Vector128<byte> st = Sse2.Xor(s, t); // st = s^t
Vector128<byte> s = Vector128_.Average(a, d); // s = (a + d + 1) / 2
Vector128<byte> t = Vector128_.Average(b, c); // t = (b + c + 1) / 2
Vector128<byte> st = s ^ t; // st = s^t
Vector128<byte> ad = Sse2.Xor(a, d); // ad = a^d
Vector128<byte> bc = Sse2.Xor(b, c); // bc = b^c
Vector128<byte> ad = a ^ d; // ad = a^d
Vector128<byte> bc = b ^ c; // bc = b^c
Vector128<byte> t1 = Sse2.Or(ad, bc); // (a^d) | (b^c)
Vector128<byte> t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t)
Vector128<byte> t3 = Sse2.And(t2, Vector128.Create((byte)1)); // (a^d) | (b^c) | (s^t) & 1
Vector128<byte> t4 = Sse2.Average(s, t);
Vector128<byte> k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4
Vector128<byte> t1 = ad | bc; // (a^d) | (b^c)
Vector128<byte> t2 = t1 | st; // (a^d) | (b^c) | (s^t)
Vector128<byte> t3 = t2 & Vector128.Create((byte)1); // (a^d) | (b^c) | (s^t) & 1
Vector128<byte> t4 = Vector128_.Average(s, t);
Vector128<byte> k = t4 - t3; // k = (a + b + c + d) / 4
Vector128<byte> diag1 = GetM(k, st, bc, t);
Vector128<byte> diag2 = GetM(k, st, ad, s);
Vector128<byte> diag1 = GetMVector128(k, st, bc, t);
Vector128<byte> diag2 = GetMVector128(k, st, ad, s);
// Pack the alternate pixels.
PackAndStore(a, b, diag1, diag2, output); // store top.
PackAndStore(c, d, diag2, diag1, output[(2 * 32)..]);
PackAndStoreVector128(a, b, diag1, diag2, output); // store top.
PackAndStoreVector128(c, d, diag2, diag1, output[(2 * 32)..]);
}
private static void UpSampleLastBlock(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
private static void UpSampleLastBlockVector128(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
{
Span<byte> r1 = stackalloc byte[17];
Span<byte> r2 = stackalloc byte[17];
@ -230,27 +230,27 @@ internal static class YuvConversion
ref byte r1Ref = ref MemoryMarshal.GetReference(r1);
ref byte r2Ref = ref MemoryMarshal.GetReference(r2);
UpSample32Pixels(ref r1Ref, ref r2Ref, output);
UpSample32PixelsVector128(ref r1Ref, ref r2Ref, output);
}
// Computes, per byte lane, out = (k + in + 1) / 2 - ((((ij) & (s^t)) | (k^in)) & 1).
// k:     the four-sample average computed by the caller, (a + b + c + d) / 4.
// st:    s ^ t, the XOR of the caller's two rounded pair averages.
// ij:    the XOR of one input pair (the caller passes either b^c or a^d).
// input: the pair average blended with k (the caller passes either t or s).
// The rounded-up average of k and input is corrected by subtracting a 0-or-1 value
// built from the lowest bit of the XOR terms.
private static Vector128<byte> GetM(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
private static Vector128<byte> GetMVector128(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
{
Vector128<byte> tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2
Vector128<byte> tmp1 = Sse2.And(ij, st); // (ij) & (s^t)
Vector128<byte> tmp2 = Sse2.Xor(k, input); // (k^in)
Vector128<byte> tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in)
Vector128<byte> tmp4 = Sse2.And(tmp3, Vector128.Create((byte)1)); // & 1 -> lsb_correction
Vector128<byte> tmp0 = Vector128_.Average(k, input); // (k + in + 1) / 2
Vector128<byte> tmp1 = ij & st; // (ij) & (s^t)
Vector128<byte> tmp2 = k ^ input; // (k^in)
Vector128<byte> tmp3 = tmp1 | tmp2; // ((ij) & (s^t)) | (k^in)
Vector128<byte> tmp4 = tmp3 & Vector128.Create((byte)1); // & 1 -> lsb_correction
return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction
return tmp0 - tmp4; // (k + in + 1) / 2 - lsb_correction
}
private static void PackAndStore(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
private static void PackAndStoreVector128(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
{
Vector128<byte> ta = Sse2.Average(a, da); // (9a + 3b + 3c + d + 8) / 16
Vector128<byte> tb = Sse2.Average(b, db); // (3a + 9b + c + 3d + 8) / 16
Vector128<byte> t1 = Sse2.UnpackLow(ta, tb);
Vector128<byte> t2 = Sse2.UnpackHigh(ta, tb);
Vector128<byte> ta = Vector128_.Average(a, da); // (9a + 3b + 3c + d + 8) / 16
Vector128<byte> tb = Vector128_.Average(b, db); // (3a + 9b + c + 3d + 8) / 16
Vector128<byte> t1 = Vector128_.UnpackLow(ta, tb);
Vector128<byte> t2 = Vector128_.UnpackHigh(ta, tb);
ref byte output0Ref = ref MemoryMarshal.GetReference(output);
ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16);
@ -562,41 +562,42 @@ internal static class YuvConversion
}
// Converts a single row starting at pixel curX.
// The luma span is sliced at curX and the destination at byte offset curX * step;
// ru and rv are forwarded unsliced to the YuvToBgr helper.
// NOTE(review): step is presumably the number of output bytes per pixel (the visible callers pass xStep = 3) - confirm.
[MethodImpl(InliningOptions.ShortMethod)]
private static void ConvertYuvToBgrSse41(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step) => YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
private static void ConvertYuvToBgrVector128(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step)
=> YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
// Converts a top row and a bottom row, both starting at pixel curX.
// Each luma span is sliced at curX and each destination at byte offset curX * step.
// The top row reads its chroma from the start of ru / rv; the bottom row reads from ru[64..] / rv[64..].
// NOTE(review): offset 64 presumably matches where the up-sampler stores the second row (output[(2 * 32)..]) - confirm.
[MethodImpl(InliningOptions.ShortMethod)]
private static void ConvertYuvToBgrWithBottomYSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
private static void ConvertYuvToBgrWithBottomYVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
{
YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
YuvToBgrSse41(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
YuvToBgrVector128(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
}
// Converts 32 samples of y / u / v into interleaved 24-bit output in dst.
// The conversion helper is called four times, at sample offsets 0, 8, 16 and 24,
// each call yielding one vector of 16-bit r, g and b values.
// The twelve 16-bit vectors are narrowed with unsigned saturation into six byte vectors
// (two each of b, g and r), which the planar-to-24-bit helper then interleaves into dst.
private static void YuvToBgrSse41(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
private static void YuvToBgrVector128(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
{
ref byte yRef = ref MemoryMarshal.GetReference(y);
ref byte uRef = ref MemoryMarshal.GetReference(u);
ref byte vRef = ref MemoryMarshal.GetReference(v);
ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);
ConvertYuv444ToBgrVector128(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);
// Cast to 8b and store as BBBBGGGGRRRR.
Vector128<byte> bgr0 = Sse2.PackUnsignedSaturate(b0, b1);
Vector128<byte> bgr1 = Sse2.PackUnsignedSaturate(b2, b3);
Vector128<byte> bgr2 = Sse2.PackUnsignedSaturate(g0, g1);
Vector128<byte> bgr3 = Sse2.PackUnsignedSaturate(g2, g3);
Vector128<byte> bgr4 = Sse2.PackUnsignedSaturate(r0, r1);
Vector128<byte> bgr5 = Sse2.PackUnsignedSaturate(r2, r3);
Vector128<byte> bgr0 = Vector128_.PackUnsignedSaturate(b0, b1);
Vector128<byte> bgr1 = Vector128_.PackUnsignedSaturate(b2, b3);
Vector128<byte> bgr2 = Vector128_.PackUnsignedSaturate(g0, g1);
Vector128<byte> bgr3 = Vector128_.PackUnsignedSaturate(g2, g3);
Vector128<byte> bgr4 = Vector128_.PackUnsignedSaturate(r0, r1);
Vector128<byte> bgr5 = Vector128_.PackUnsignedSaturate(r2, r3);
// Pack as BGRBGRBGRBGR.
PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
PlanarTo24bVector128(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
}
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
private static void PlanarTo24bSse41(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
private static void PlanarTo24bVector128(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
{
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
@ -612,7 +613,7 @@ internal static class YuvConversion
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
// Process R.
ChannelMixing(
ChannelMixingVector128(
input0,
input1,
Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5), // PlanarTo24Shuffle0
@ -627,7 +628,7 @@ internal static class YuvConversion
// Process G.
// Same as before, just shifted to the left by one and including the right padding.
ChannelMixing(
ChannelMixingVector128(
input2,
input3,
Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255), // PlanarTo24Shuffle3
@ -641,7 +642,7 @@ internal static class YuvConversion
out Vector128<byte> g5);
// Process B.
ChannelMixing(
ChannelMixingVector128(
input4,
input5,
Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255), // PlanarTo24Shuffle6
@ -655,24 +656,24 @@ internal static class YuvConversion
out Vector128<byte> b5);
// OR the different channels.
Vector128<byte> rg0 = Sse2.Or(r0, g0);
Vector128<byte> rg1 = Sse2.Or(r1, g1);
Vector128<byte> rg2 = Sse2.Or(r2, g2);
Vector128<byte> rg3 = Sse2.Or(r3, g3);
Vector128<byte> rg4 = Sse2.Or(r4, g4);
Vector128<byte> rg5 = Sse2.Or(r5, g5);
Vector128<byte> rg0 = r0 | g0;
Vector128<byte> rg1 = r1 | g1;
Vector128<byte> rg2 = r2 | g2;
Vector128<byte> rg3 = r3 | g3;
Vector128<byte> rg4 = r4 | g4;
Vector128<byte> rg5 = r5 | g5;
ref byte outputRef = ref MemoryMarshal.GetReference(rgb);
Unsafe.As<byte, Vector128<byte>>(ref outputRef) = Sse2.Or(rg0, b0);
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1);
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2);
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3);
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4);
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5);
Unsafe.As<byte, Vector128<byte>>(ref outputRef) = rg0 | b0;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = rg1 | b1;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = rg2 | b2;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = rg3 | b3;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = rg4 | b4;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = rg5 | b5;
}
// Shuffles the input buffer as A0 0 0 A1 0 0 A2
private static void ChannelMixing(
private static void ChannelMixingVector128(
Vector128<byte> input0,
Vector128<byte> input1,
Vector128<byte> shuffle0,
@ -685,53 +686,53 @@ internal static class YuvConversion
out Vector128<byte> output4,
out Vector128<byte> output5)
{
output0 = Ssse3.Shuffle(input0, shuffle0);
output1 = Ssse3.Shuffle(input0, shuffle1);
output2 = Ssse3.Shuffle(input0, shuffle2);
output3 = Ssse3.Shuffle(input1, shuffle0);
output4 = Ssse3.Shuffle(input1, shuffle1);
output5 = Ssse3.Shuffle(input1, shuffle2);
output0 = Vector128_.ShuffleNative(input0, shuffle0);
output1 = Vector128_.ShuffleNative(input0, shuffle1);
output2 = Vector128_.ShuffleNative(input0, shuffle2);
output3 = Vector128_.ShuffleNative(input1, shuffle0);
output4 = Vector128_.ShuffleNative(input1, shuffle1);
output5 = Vector128_.ShuffleNative(input1, shuffle2);
}
// Converts 8 samples of YUV444 to 16-bit B/G/R lanes; the calling helper invokes this
// four times (sample offsets 0, 8, 16, 24) to cover 32 samples.
// y, u, v: references to the first sample of each plane to convert.
// r, g, b: each receives one vector of eight 16-bit results. The values are not clamped
// to the byte range here (see the range comments on the final shifts).
// NOTE(review): every plane is read as a full 16-byte vector although UnpackLow only
// consumes the low 8 bytes, so 16 readable bytes are assumed behind each reference - confirm callers guarantee this.
private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
private static void ConvertYuv444ToBgrVector128(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
{
// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
Vector128<byte> y0 = Unsafe.As<byte, Vector128<byte>>(ref y);
Vector128<byte> u0 = Unsafe.As<byte, Vector128<byte>>(ref u);
Vector128<byte> v0 = Unsafe.As<byte, Vector128<byte>>(ref v);
y0 = Sse2.UnpackLow(Vector128<byte>.Zero, y0);
u0 = Sse2.UnpackLow(Vector128<byte>.Zero, u0);
v0 = Sse2.UnpackLow(Vector128<byte>.Zero, v0);
y0 = Vector128_.UnpackLow(Vector128<byte>.Zero, y0);
u0 = Vector128_.UnpackLow(Vector128<byte>.Zero, u0);
v0 = Vector128_.UnpackLow(Vector128<byte>.Zero, v0);
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
var k19077 = Vector128.Create((ushort)19077);
var k26149 = Vector128.Create((ushort)26149);
var k14234 = Vector128.Create((ushort)14234);
Vector128<ushort> k19077 = Vector128.Create((ushort)19077);
Vector128<ushort> k26149 = Vector128.Create((ushort)26149);
Vector128<ushort> k14234 = Vector128.Create((ushort)14234);
// The inputs hold x << 8, so keeping the high 16 bits of each product gives (x * c) >> 8;
// together with the final >> 6 this is the 14-bit fixed-point scale.
Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), k19077);
Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), k26149);
Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
Vector128<ushort> y1 = Vector128_.MultiplyHigh(y0.AsUInt16(), k19077);
Vector128<ushort> r0 = Vector128_.MultiplyHigh(v0.AsUInt16(), k26149);
Vector128<ushort> g0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
Vector128<ushort> g1 = Vector128_.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
Vector128<ushort> r1 = Sse2.Subtract(y1.AsUInt16(), k14234);
Vector128<ushort> r2 = Sse2.Add(r1, r0);
Vector128<ushort> r1 = y1.AsUInt16() - k14234;
Vector128<ushort> r2 = r1 + r0;
Vector128<ushort> g2 = Sse2.Add(y1.AsUInt16(), Vector128.Create((ushort)8708));
Vector128<ushort> g3 = Sse2.Add(g0, g1);
Vector128<ushort> g4 = Sse2.Subtract(g2, g3);
Vector128<ushort> g2 = y1.AsUInt16() + Vector128.Create((ushort)8708);
Vector128<ushort> g3 = g0 + g1;
Vector128<ushort> g4 = g2 - g3;
// The byte pair (26, 129) reinterpreted as a 16-bit lane is 129 * 256 + 26 = 33050
// (assuming little-endian lane layout), i.e. the u coefficient of the B formula.
// B uses saturating add/subtract, unlike the wrapping arithmetic used for R and G.
Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
Vector128<ushort> b1 = Sse2.AddSaturate(b0, y1);
Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, Vector128.Create((ushort)17685));
Vector128<ushort> b0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
Vector128<ushort> b1 = Vector128_.AddSaturate(b0, y1);
Vector128<ushort> b2 = Vector128_.SubtractSaturate(b1, Vector128.Create((ushort)17685));
// Use logical shift for B2, which can be larger than 32767.
r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
r = Vector128.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
g = Vector128.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
b = Vector128.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
}
[MethodImpl(InliningOptions.ShortMethod)]

88
src/ImageSharp/Formats/Webp/WebpCommonUtils.cs

@ -3,7 +3,7 @@
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Formats.Webp;
@ -20,7 +20,7 @@ internal static class WebpCommonUtils
/// <returns>Returns true if alpha has non-0xff values.</returns>
public static unsafe bool CheckNonOpaque(ReadOnlySpan<Bgra32> row)
{
if (Avx2.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
int i = 0;
@ -32,20 +32,20 @@ internal static class WebpCommonUtils
for (; i + 128 <= length; i += 128)
{
Vector256<byte> a0 = Avx.LoadVector256(src + i).AsByte();
Vector256<byte> a1 = Avx.LoadVector256(src + i + 32).AsByte();
Vector256<byte> a2 = Avx.LoadVector256(src + i + 64).AsByte();
Vector256<byte> a3 = Avx.LoadVector256(src + i + 96).AsByte();
Vector256<int> b0 = Avx2.And(a0, alphaMaskVector256).AsInt32();
Vector256<int> b1 = Avx2.And(a1, alphaMaskVector256).AsInt32();
Vector256<int> b2 = Avx2.And(a2, alphaMaskVector256).AsInt32();
Vector256<int> b3 = Avx2.And(a3, alphaMaskVector256).AsInt32();
Vector256<short> c0 = Avx2.PackSignedSaturate(b0, b1).AsInt16();
Vector256<short> c1 = Avx2.PackSignedSaturate(b2, b3).AsInt16();
Vector256<byte> d = Avx2.PackSignedSaturate(c0, c1).AsByte();
Vector256<byte> bits = Avx2.CompareEqual(d, all0x80Vector256);
int mask = Avx2.MoveMask(bits);
if (mask != -1)
Vector256<byte> a0 = Vector256.Load(src + i).AsByte();
Vector256<byte> a1 = Vector256.Load(src + i + 32).AsByte();
Vector256<byte> a2 = Vector256.Load(src + i + 64).AsByte();
Vector256<byte> a3 = Vector256.Load(src + i + 96).AsByte();
Vector256<int> b0 = (a0 & alphaMaskVector256).AsInt32();
Vector256<int> b1 = (a1 & alphaMaskVector256).AsInt32();
Vector256<int> b2 = (a2 & alphaMaskVector256).AsInt32();
Vector256<int> b3 = (a3 & alphaMaskVector256).AsInt32();
Vector256<short> c0 = Vector256_.PackSignedSaturate(b0, b1).AsInt16();
Vector256<short> c1 = Vector256_.PackSignedSaturate(b2, b3).AsInt16();
Vector256<byte> d = Vector256_.PackSignedSaturate(c0, c1).AsByte();
Vector256<byte> bits = Vector256.Equals(d, all0x80Vector256);
uint mask = bits.ExtractMostSignificantBits();
if (mask != 0xFFFF_FFFF)
{
return true;
}
@ -53,7 +53,7 @@ internal static class WebpCommonUtils
for (; i + 64 <= length; i += 64)
{
if (IsNoneOpaque64Bytes(src, i))
if (IsNoneOpaque64BytesVector128(src, i))
{
return true;
}
@ -61,7 +61,7 @@ internal static class WebpCommonUtils
for (; i + 32 <= length; i += 32)
{
if (IsNoneOpaque32Bytes(src, i))
if (IsNonOpaque32BytesVector128(src, i))
{
return true;
}
@ -76,7 +76,7 @@ internal static class WebpCommonUtils
}
}
}
else if (Sse2.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
int i = 0;
@ -85,7 +85,7 @@ internal static class WebpCommonUtils
{
for (; i + 64 <= length; i += 64)
{
if (IsNoneOpaque64Bytes(src, i))
if (IsNoneOpaque64BytesVector128(src, i))
{
return true;
}
@ -93,7 +93,7 @@ internal static class WebpCommonUtils
for (; i + 32 <= length; i += 32)
{
if (IsNoneOpaque32Bytes(src, i))
if (IsNonOpaque32BytesVector128(src, i))
{
return true;
}
@ -122,38 +122,38 @@ internal static class WebpCommonUtils
return false;
}
// Checks the 64 bytes at src + i (four 16-byte loads) for an alpha value that is not opaque.
// The mask keeps only byte 3 of every 4-byte group. The four masked vectors are then narrowed
// with signed saturation from 32-bit to 16-bit to 8-bit lanes, giving one byte per group,
// and that result is compared against 0x80: with byte 3 as the top byte of the 32-bit lane
// (little-endian layout assumed), an alpha of 0xFF makes the lane negative and it saturates to 0x80.
// Returns true when at least one of the 16 narrowed bytes differs from 0x80.
// NOTE(review): every alpha with its top bit set (0x80..0xFE, not only 0xFF) also produces a
// negative lane that saturates to 0x80, so such values are indistinguishable from opaque here - confirm this is intended.
// NOTE(review): the name reads "None" where "Non" is probably meant; the 32-byte sibling is renamed to IsNonOpaque32BytesVector128.
private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i)
private static unsafe bool IsNoneOpaque64BytesVector128(byte* src, int i)
{
Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
Vector128<byte> a2 = Sse2.LoadVector128(src + i + 32).AsByte();
Vector128<byte> a3 = Sse2.LoadVector128(src + i + 48).AsByte();
Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
Vector128<int> b2 = Sse2.And(a2, alphaMask).AsInt32();
Vector128<int> b3 = Sse2.And(a3, alphaMask).AsInt32();
Vector128<short> c0 = Sse2.PackSignedSaturate(b0, b1).AsInt16();
Vector128<short> c1 = Sse2.PackSignedSaturate(b2, b3).AsInt16();
Vector128<byte> d = Sse2.PackSignedSaturate(c0, c1).AsByte();
Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
int mask = Sse2.MoveMask(bits);
Vector128<byte> a0 = Vector128.Load(src + i).AsByte();
Vector128<byte> a1 = Vector128.Load(src + i + 16).AsByte();
Vector128<byte> a2 = Vector128.Load(src + i + 32).AsByte();
Vector128<byte> a3 = Vector128.Load(src + i + 48).AsByte();
Vector128<int> b0 = (a0 & alphaMask).AsInt32();
Vector128<int> b1 = (a1 & alphaMask).AsInt32();
Vector128<int> b2 = (a2 & alphaMask).AsInt32();
Vector128<int> b3 = (a3 & alphaMask).AsInt32();
Vector128<short> c0 = Vector128_.PackSignedSaturate(b0, b1).AsInt16();
Vector128<short> c1 = Vector128_.PackSignedSaturate(b2, b3).AsInt16();
Vector128<byte> d = Vector128_.PackSignedSaturate(c0, c1).AsByte();
Vector128<byte> bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte());
uint mask = bits.ExtractMostSignificantBits();
// One mask bit per byte lane: 0xFFFF means all 16 lanes compared equal to 0x80.
return mask != 0xFFFF;
}
// Checks the 32 bytes at src + i (two 16-byte loads) for an alpha value that is not opaque.
// The mask keeps only byte 3 of every 4-byte group. The two masked vectors are narrowed with
// signed saturation to eight 16-bit lanes, which are then packed with themselves so the same
// eight results fill both halves of the byte vector before the comparison against 0x80.
// Returns true when at least one narrowed byte differs from 0x80.
// NOTE(review): every alpha with its top bit set (0x80..0xFE, not only 0xFF) gives a negative
// 32-bit lane (little-endian layout assumed) that saturates to 0x80, so such values are
// indistinguishable from opaque here - confirm this is intended.
private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i)
private static unsafe bool IsNonOpaque32BytesVector128(byte* src, int i)
{
Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
Vector128<short> c = Sse2.PackSignedSaturate(b0, b1).AsInt16();
Vector128<byte> d = Sse2.PackSignedSaturate(c, c).AsByte();
Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
int mask = Sse2.MoveMask(bits);
Vector128<byte> a0 = Vector128.Load(src + i).AsByte();
Vector128<byte> a1 = Vector128.Load(src + i + 16).AsByte();
Vector128<int> b0 = (a0 & alphaMask).AsInt32();
Vector128<int> b1 = (a1 & alphaMask).AsInt32();
Vector128<short> c = Vector128_.PackSignedSaturate(b0, b1).AsInt16();
Vector128<byte> d = Vector128_.PackSignedSaturate(c, c).AsByte();
Vector128<byte> bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte());
uint mask = bits.ExtractMostSignificantBits();
// One mask bit per byte lane: 0xFFFF means all 16 lanes compared equal to 0x80.
return mask != 0xFFFF;
}
}

8
tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs

@ -71,17 +71,17 @@ public class ColorSpaceTransformUtilsTests
public void CollectColorBlueTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.AllowAll);
// Runs RunCollectColorBlueTransformsTest through FeatureTestRunner with HwIntrinsics.DisableSSE41.
// NOTE(review): the test is renamed to "WithoutVector128" but still passes DisableSSE41;
// confirm this flag actually turns off the Vector128 code path on every target the test runs on.
[Fact]
public void CollectColorBlueTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
public void CollectColorBlueTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
// Runs RunCollectColorBlueTransformsTest through FeatureTestRunner with HwIntrinsics.DisableAVX2.
// NOTE(review): the test is renamed to "WithoutVector256" but still passes DisableAVX2;
// confirm this flag is what gates the Vector256 code path under test.
[Fact]
public void CollectColorBlueTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
public void CollectColorBlueTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
// Runs RunCollectColorRedTransformsTest through FeatureTestRunner with HwIntrinsics.AllowAll.
[Fact]
public void CollectColorRedTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.AllowAll);
// Runs RunCollectColorRedTransformsTest through FeatureTestRunner with HwIntrinsics.DisableSSE41.
// NOTE(review): the test is renamed to "WithoutVector128" but still passes DisableSSE41;
// confirm this flag actually turns off the Vector128 code path on every target the test runs on.
[Fact]
public void CollectColorRedTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
public void CollectColorRedTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
// Runs RunCollectColorRedTransformsTest through FeatureTestRunner with HwIntrinsics.DisableAVX2.
// NOTE(review): the test is renamed to "WithoutVector256" but still passes DisableAVX2;
// confirm this flag is what gates the Vector256 code path under test.
[Fact]
public void CollectColorRedTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
public void CollectColorRedTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
}

Loading…
Cancel
Save