Browse Source

Merge branch 'main' into main

pull/2940/head
James Jackson-South 8 months ago
committed by GitHub
parent
commit
33c055d125
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 17
      src/ImageSharp/Common/Helpers/Numerics.cs
  2. 57
      src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
  3. 1092
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  4. 315
      src/ImageSharp/Common/Helpers/Vector256Utilities.cs
  5. 96
      src/ImageSharp/Common/Helpers/Vector512Utilities.cs
  6. 1
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.RgbScalar.cs
  7. 118
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykScalar.cs
  8. 99
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector128.cs
  9. 99
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector256.cs
  10. 108
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector512.cs
  11. 153
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKScalar.cs
  12. 131
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector128.cs
  13. 131
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector256.cs
  14. 142
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector512.cs
  15. 6
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKScalar.cs
  16. 10
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs
  17. 77
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverterBase.cs
  18. 3
      src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
  19. 3
      src/ImageSharp/Formats/Jpeg/Components/Encoder/SpectralConverter{TPixel}.cs
  20. 10
      src/ImageSharp/Formats/Jpeg/Components/JpegColorSpace.cs
  21. 18
      src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
  22. 9
      src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs
  23. 8
      src/ImageSharp/Formats/Tiff/Compression/Decompressors/OldJpegTiffCompression.cs
  24. 29
      src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffJpegSpectralConverter{TPixel}.cs
  25. 5
      src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffOldJpegSpectralConverter{TPixel}.cs
  26. 6
      src/ImageSharp/Formats/Tiff/Compression/TiffDecompressorsFactory.cs
  27. 23
      src/ImageSharp/Formats/Tiff/PhotometricInterpretation/CmykTiffColor{TPixel}.cs
  28. 4
      src/ImageSharp/Formats/Tiff/PhotometricInterpretation/TiffColorDecoderFactory{TPixel}.cs
  29. 25
      src/ImageSharp/Formats/Tiff/TiffDecoderCore.cs
  30. 8
      src/ImageSharp/Formats/Tiff/TiffDecoderMetadataCreator.cs
  31. 7
      src/ImageSharp/Formats/Webp/AlphaDecoder.cs
  32. 145
      src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
  33. 196
      src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
  34. 1273
      src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
  35. 355
      src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
  36. 209
      src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
  37. 88
      src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
  38. 1
      src/ImageSharp/Metadata/Profiles/IPTC/IptcProfile.cs
  39. 2
      src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs
  40. 32
      tests/ImageSharp.Tests/Formats/Tiff/TiffDecoderTests.cs
  41. 8
      tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
  42. 3
      tests/ImageSharp.Tests/TestImages.cs
  43. 3
      tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_Cmyk_Rgba32_Cmyk-jpeg.png
  44. 3
      tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_A.png
  45. 3
      tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_B.png
  46. 3
      tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_A.png
  47. 3
      tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_B.png
  48. 3
      tests/Images/Input/Tiff/Cmyk-jpeg.tiff
  49. 3
      tests/Images/Input/Tiff/Cmyk-planar-jpg.tiff
  50. 3
      tests/Images/Input/Tiff/Issues/Issue2454_A.tif
  51. 3
      tests/Images/Input/Tiff/Issues/Issue2454_B.tif

17
src/ImageSharp/Common/Helpers/Numerics.cs

@ -884,23 +884,6 @@ internal static class Numerics
accumulator += intHigh;
}
/// <summary>
/// Reduces elements of the vector into one sum.
/// </summary>
/// <param name="accumulator">The accumulator to reduce.</param>
/// <returns>The sum of all elements.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int ReduceSum(Vector128<int> accumulator)
{
// Add odd to even.
Vector128<int> vsum = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0b_11_11_01_01));
// Add high to low.
vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10));
return Sse2.ConvertToInt32(vsum);
}
/// <summary>
/// Reduces elements of the vector into one sum.
/// </summary>

57
src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

@ -66,9 +66,9 @@ internal static partial class SimdUtils
ref Span<float> destination,
[ConstantExpected] byte control)
{
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) ||
Vector128.IsHardwareAccelerated)
if (Vector512.IsHardwareAccelerated ||
Vector256.IsHardwareAccelerated ||
Vector128.IsHardwareAccelerated)
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@ -112,9 +112,9 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) ||
(Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte))
if (Vector512.IsHardwareAccelerated ||
Vector256.IsHardwareAccelerated ||
Vector128.IsHardwareAccelerated)
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@ -158,7 +158,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight)
if (Vector128.IsHardwareAccelerated)
{
int remainder = source.Length % (Vector128<byte>.Count * 3);
@ -190,7 +190,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated)
{
int remainder = source.Length % (Vector128<byte>.Count * 3);
@ -223,7 +223,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated)
{
int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1); // bit-hack for modulo
@ -249,7 +249,7 @@ internal static partial class SimdUtils
Span<float> destination,
[ConstantExpected] byte control)
{
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
if (Vector512.IsHardwareAccelerated)
{
ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
@ -277,7 +277,7 @@ internal static partial class SimdUtils
}
}
}
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat)
else if (Vector256.IsHardwareAccelerated)
{
ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@ -341,7 +341,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte)
if (Vector512.IsHardwareAccelerated)
{
Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@ -373,8 +373,13 @@ internal static partial class SimdUtils
}
}
}
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte)
else if (Vector256.IsHardwareAccelerated)
{
// ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb).
// MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte,
// so the shuffle will not zero elements. Because vpshufb uses only the low 4 bits (b[i] & 0x0F)
// for indexing within each lane, and ignores the upper bits unless bit 7 is set,
// this usage is guaranteed to remain within-lane and non-zeroing.
Span<byte> temp = stackalloc byte[Vector256<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
Vector256<byte> mask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp));
@ -391,21 +396,21 @@ internal static partial class SimdUtils
ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector256_.ShuffleNative(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector256_.ShufflePerLane(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)
else if (Vector128.IsHardwareAccelerated)
{
Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@ -445,9 +450,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated &&
Vector128_.SupportsShuffleNativeByte &&
Vector128_.SupportsAlignRight)
if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
@ -507,10 +510,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated &&
Vector128_.SupportsShuffleNativeByte &&
Vector128_.SupportsShiftByte &&
Vector128_.SupportsAlignRight)
if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@ -553,10 +553,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated &&
Vector128_.SupportsShuffleNativeByte &&
Vector128_.SupportsShiftByte &&
Vector128_.SupportsAlignRight)
if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);

1092
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

File diff suppressed because it is too large

315
src/ImageSharp/Common/Helpers/Vector256Utilities.cs

@ -1,7 +1,6 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers;
internal static class Vector256_
#pragma warning restore SA1649 // File name should match first type name
{
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
/// </summary>
public static bool SupportsShuffleNativeFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx.IsSupported;
}
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
/// </summary>
public static bool SupportsShuffleNativeByte
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx2.IsSupported;
}
/// <summary>
/// Creates a new vector by selecting values from an input vector using a set of indices.
/// </summary>
@ -47,15 +28,7 @@ internal static class Vector256_
/// <returns>The <see cref="Vector256{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> ShuffleNative(Vector256<float> vector, [ConstantExpected] byte control)
{
if (Avx.IsSupported)
{
return Avx.Shuffle(vector, vector, control);
}
ThrowUnreachableException();
return default;
}
=> Avx.Shuffle(vector, vector, control);
/// <summary>
/// Creates a new vector by selecting values from an input vector using a set of indices.</summary>
@ -66,15 +39,17 @@ internal static class Vector256_
/// </param>
/// <returns>The <see cref="Vector256{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> ShuffleNative(Vector256<byte> vector, Vector256<byte> indices)
public static Vector256<byte> ShufflePerLane(Vector256<byte> vector, Vector256<byte> indices)
{
if (Avx2.IsSupported)
{
return Avx2.Shuffle(vector, indices);
}
ThrowUnreachableException();
return default;
Vector128<byte> indicesLo = indices.GetLower();
Vector128<byte> lower = Vector128_.ShuffleNative(vector.GetLower(), indicesLo);
Vector128<byte> upper = Vector128_.ShuffleNative(vector.GetUpper(), indicesLo);
return Vector256.Create(lower, upper);
}
/// <summary>
@ -162,6 +137,54 @@ internal static class Vector256_
return (vm0 * vm1) - vs;
}
/// <summary>
/// Multiply packed signed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and
/// pack the results.
/// </summary>
/// <param name="left">
/// The first vector containing packed signed 16-bit integers to multiply and add.
/// </param>
/// <param name="right">
/// The second vector containing packed signed 16-bit integers to multiply and add.
/// </param>
/// <returns>
/// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<int> MultiplyAddAdjacent(Vector256<short> left, Vector256<short> right)
{
if (Avx2.IsSupported)
{
return Avx2.MultiplyAddAdjacent(left, right);
}
return Vector256.Create(
Vector128_.MultiplyAddAdjacent(left.GetLower(), right.GetLower()),
Vector128_.MultiplyAddAdjacent(left.GetUpper(), right.GetUpper()));
}
/// <summary>
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
/// </summary>
/// <param name="left">The left hand source vector.</param>
/// <param name="right">The right hand source vector.</param>
/// <returns>The <see cref="Vector256{UInt16}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<ushort> PackUnsignedSaturate(Vector256<int> left, Vector256<int> right)
{
if (Avx2.IsSupported)
{
return Avx2.PackUnsignedSaturate(left, right);
}
Vector256<int> min = Vector256.Create((int)ushort.MinValue);
Vector256<int> max = Vector256.Create((int)ushort.MaxValue);
Vector256<uint> lefClamped = Clamp(left, min, max).AsUInt32();
Vector256<uint> rightClamped = Clamp(right, min, max).AsUInt32();
return Vector256.Narrow(lefClamped, rightClamped);
}
/// <summary>
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
/// </summary>
@ -183,6 +206,27 @@ internal static class Vector256_
return Vector256.Narrow(lefClamped, rightClamped);
}
/// <summary>
/// Packs signed 16-bit integers to signed 8-bit integers and saturates.
/// </summary>
/// <param name="left">The left hand source vector.</param>
/// <param name="right">The right hand source vector.</param>
/// <returns>The <see cref="Vector256{SByte}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<sbyte> PackSignedSaturate(Vector256<short> left, Vector256<short> right)
{
if (Avx2.IsSupported)
{
return Avx2.PackSignedSaturate(left, right);
}
Vector256<short> min = Vector256.Create((short)sbyte.MinValue);
Vector256<short> max = Vector256.Create((short)sbyte.MaxValue);
Vector256<short> lefClamped = Clamp(left, min, max);
Vector256<short> rightClamped = Clamp(right, min, max);
return Vector256.Narrow(lefClamped, rightClamped);
}
/// <summary>
/// Restricts a vector between a minimum and a maximum value.
/// </summary>
@ -210,6 +254,211 @@ internal static class Vector256_
return Vector256.WidenLower(value.ToVector256());
}
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
/// <summary>
/// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
/// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 16-bit integers to multiply.
/// </param>
/// <param name="right">
/// The second vector containing packed 16-bit integers to multiply.
/// </param>
/// <returns>
/// A vector containing the low 16 bits of the products of the packed 16-bit integers
/// from <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<short> MultiplyLow(Vector256<short> left, Vector256<short> right)
{
if (Avx2.IsSupported)
{
return Avx2.MultiplyLow(left, right);
}
// Widen each half of the short vectors into two int vectors
(Vector256<int> leftLower, Vector256<int> leftUpper) = Vector256.Widen(left);
(Vector256<int> rightLower, Vector256<int> rightUpper) = Vector256.Widen(right);
// Elementwise multiply: each int lane now holds the full 32-bit product
Vector256<int> prodLo = leftLower * rightLower;
Vector256<int> prodHi = leftUpper * rightUpper;
// Narrow the two int vectors back into one short vector
return Vector256.Narrow(prodLo, prodHi);
}
/// <summary>
/// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
/// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 16-bit integers to multiply.
/// </param>
/// <param name="right">
/// The second vector containing packed 16-bit integers to multiply.
/// </param>
/// <returns>
/// A vector containing the high 16 bits of the products of the packed 16-bit integers
/// from <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<short> MultiplyHigh(Vector256<short> left, Vector256<short> right)
{
if (Avx2.IsSupported)
{
return Avx2.MultiplyHigh(left, right);
}
// Widen each half of the short vectors into two int vectors
(Vector256<int> leftLower, Vector256<int> leftUpper) = Vector256.Widen(left);
(Vector256<int> rightLower, Vector256<int> rightUpper) = Vector256.Widen(right);
// Elementwise multiply: each int lane now holds the full 32-bit product
Vector256<int> prodLo = leftLower * rightLower;
Vector256<int> prodHi = leftUpper * rightUpper;
// Arithmetic shift right by 16 bits to extract the high word
prodLo >>= 16;
prodHi >>= 16;
// Narrow the two int vectors back into one short vector
return Vector256.Narrow(prodLo, prodHi);
}
/// <summary>
/// Unpack and interleave 32-bit integers from the low half of <paramref name="left"/> and <paramref name="right"/>
/// and store the results in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 32-bit integers to unpack from the low half.
/// </param>
/// <param name="right">
/// The second vector containing packed 32-bit integers to unpack from the low half.
/// </param>
/// <returns>
/// A vector containing the unpacked and interleaved 32-bit integers from the low
/// halves of <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<int> UnpackLow(Vector256<int> left, Vector256<int> right)
{
if (Avx2.IsSupported)
{
return Avx2.UnpackLow(left, right);
}
Vector128<int> lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
Vector128<int> hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
return Vector256.Create(lo, hi);
}
/// <summary>
/// Unpack and interleave 8-bit integers from the high half of <paramref name="left"/> and <paramref name="right"/>
/// and store the results in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 8-bit integers to unpack from the high half.
/// </param>
/// <param name="right">
/// The second vector containing packed 8-bit integers to unpack from the high half.
/// </param>
/// <returns>
/// A vector containing the unpacked and interleaved 8-bit integers from the high
/// halves of <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> UnpackHigh(Vector256<byte> left, Vector256<byte> right)
{
if (Avx2.IsSupported)
{
return Avx2.UnpackHigh(left, right);
}
Vector128<byte> lo = Vector128_.UnpackHigh(left.GetLower(), right.GetLower());
Vector128<byte> hi = Vector128_.UnpackHigh(left.GetUpper(), right.GetUpper());
return Vector256.Create(lo, hi);
}
/// <summary>
/// Unpack and interleave 8-bit integers from the low half of <paramref name="left"/> and <paramref name="right"/>
/// and store the results in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 8-bit integers to unpack from the low half.
/// </param>
/// <param name="right">
/// The second vector containing packed 8-bit integers to unpack from the low half.
/// </param>
/// <returns>
/// A vector containing the unpacked and interleaved 8-bit integers from the low
/// halves of <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> UnpackLow(Vector256<byte> left, Vector256<byte> right)
{
if (Avx2.IsSupported)
{
return Avx2.UnpackLow(left, right);
}
Vector128<byte> lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
Vector128<byte> hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
return Vector256.Create(lo, hi);
}
/// <summary>
/// Subtract packed signed 16-bit integers in <paramref name="right"/> from packed signed 16-bit integers
/// in <paramref name="left"/> using saturation, and store the results.
/// </summary>
/// <param name="left">
/// The first vector containing packed signed 16-bit integers to subtract from.
/// </param>
/// <param name="right">
/// The second vector containing packed signed 16-bit integers to subtract.
/// </param>
/// <returns>
/// A vector containing the results of subtracting packed unsigned 16-bit integers
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<short> SubtractSaturate(Vector256<short> left, Vector256<short> right)
{
if (Avx2.IsSupported)
{
return Avx2.SubtractSaturate(left, right);
}
return Vector256.Create(
Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
}
/// <summary>
/// Subtract packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
/// in <paramref name="left"/> using saturation, and store the results.
/// </summary>
/// <param name="left">
/// The first vector containing packed unsigned 8-bit integers to subtract from.
/// </param>
/// <param name="right">
/// The second vector containing packed unsigned 8-bit integers to subtract.
/// </param>
/// <returns>
/// A vector containing the results of subtracting packed unsigned 8-bit integers
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> SubtractSaturate(Vector256<byte> left, Vector256<byte> right)
{
if (Avx2.IsSupported)
{
return Avx2.SubtractSaturate(left, right);
}
return Vector256.Create(
Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
}
}

96
src/ImageSharp/Common/Helpers/Vector512Utilities.cs

@ -1,7 +1,6 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers;
internal static class Vector512_
#pragma warning restore SA1649 // File name should match first type name
{
/// <summary>
/// Gets a value indicating whether shuffle float operations are supported.
/// </summary>
public static bool SupportsShuffleNativeFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx512F.IsSupported;
}
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
/// </summary>
public static bool SupportsShuffleNativeByte
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx512BW.IsSupported;
}
/// <summary>
/// Creates a new vector by selecting values from an input vector using the control.
/// </summary>
@ -47,15 +28,7 @@ internal static class Vector512_
/// <returns>The <see cref="Vector512{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> ShuffleNative(Vector512<float> vector, [ConstantExpected] byte control)
{
if (Avx512F.IsSupported)
{
return Avx512F.Shuffle(vector, vector, control);
}
ThrowUnreachableException();
return default;
}
=> Avx512F.Shuffle(vector, vector, control);
/// <summary>
/// Creates a new vector by selecting values from an input vector using a set of indices.
@ -73,8 +46,7 @@ internal static class Vector512_
return Avx512BW.Shuffle(vector, indices);
}
ThrowUnreachableException();
return default;
return Vector512.Shuffle(vector, indices);
}
/// <summary>
@ -85,25 +57,7 @@ internal static class Vector512_
/// <returns>The <see cref="Vector128{Int32}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<int> ConvertToInt32RoundToEven(Vector512<float> vector)
{
if (Avx512F.IsSupported)
{
return Avx512F.ConvertToVector512Int32(vector);
}
if (Avx.IsSupported)
{
Vector256<int> lower = Avx.ConvertToVector256Int32(vector.GetLower());
Vector256<int> upper = Avx.ConvertToVector256Int32(vector.GetUpper());
return Vector512.Create(lower, upper);
}
Vector512<float> sign = vector & Vector512.Create(-0.0f);
Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608.0f);
val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32;
return Vector512.ConvertToInt32(val_2p23_f32 | sign);
}
=> Avx512F.ConvertToVector512Int32(vector);
/// <summary>
/// Rounds all values in <paramref name="vector"/> to the nearest integer
@ -112,28 +66,11 @@ internal static class Vector512_
/// <param name="vector">The vector</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> RoundToNearestInteger(Vector512<float> vector)
{
if (Avx512F.IsSupported)
{
// imm8 = 0b1000:
// imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers)
// imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions)
return Avx512F.RoundScale(vector, 0b0000_1000);
}
if (Avx.IsSupported)
{
Vector256<float> lower = Avx.RoundToNearestInteger(vector.GetLower());
Vector256<float> upper = Avx.RoundToNearestInteger(vector.GetUpper());
return Vector512.Create(lower, upper);
}
Vector512<float> sign = vector & Vector512.Create(-0F);
Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608F);
val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32;
return val_2p23_f32 | sign;
}
// imm8 = 0b1000:
// imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers)
// imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions)
=> Avx512F.RoundScale(vector, 0b0000_1000);
/// <summary>
/// Performs a multiplication and an addition of the <see cref="Vector512{Single}"/>.
@ -148,21 +85,7 @@ internal static class Vector512_
Vector512<float> va,
Vector512<float> vm0,
Vector512<float> vm1)
{
if (Avx512F.IsSupported)
{
return Avx512F.FusedMultiplyAdd(vm0, vm1, va);
}
if (Fma.IsSupported)
{
Vector256<float> lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower());
Vector256<float> upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper());
return Vector512.Create(lower, upper);
}
return va + (vm0 * vm1);
}
=> Avx512F.FusedMultiplyAdd(vm0, vm1, va);
/// <summary>
/// Restricts a vector between a minimum and a maximum value.
@ -175,7 +98,4 @@ internal static class Vector512_
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<T> Clamp<T>(Vector512<T> value, Vector512<T> min, Vector512<T> max)
=> Vector512.Min(Vector512.Max(value, min), max);
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}

1
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.RgbScalar.cs

@ -75,6 +75,7 @@ internal abstract partial class JpegColorConverterBase
internal static void ConvertFromRgb(ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
{
// TODO: This doesn't seem correct. We should be scaling to the maximum value here.
rLane.CopyTo(values.Component0);
gLane.CopyTo(values.Component1);
bLane.CopyTo(values.Component2);

118
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykScalar.cs

@ -0,0 +1,118 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Buffers;
using System.Numerics;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.ColorProfiles;
using SixLabors.ImageSharp.ColorProfiles.Icc;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal abstract partial class JpegColorConverterBase
{
/// <summary>
/// Color converter for tiff images, which use the jpeg compression and CMYK colorspace.
/// </summary>
internal sealed class TiffCmykScalar : JpegColorConverterScalar
{
public TiffCmykScalar(int precision)
: base(JpegColorSpace.TiffCmyk, precision)
{
}
/// <inheritdoc/>
public override void ConvertToRgbInPlace(in ComponentValues values)
=> ConvertToRgbInPlace(in values, this.MaximumValue);
/// <inheritdoc/>
public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
=> ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
=> ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane);
public static void ConvertToRgbInPlace(in ComponentValues values, float maxValue)
{
Span<float> c0 = values.Component0;
Span<float> c1 = values.Component1;
Span<float> c2 = values.Component2;
Span<float> c3 = values.Component3;
float scale = 1 / maxValue;
for (int i = 0; i < c0.Length; i++)
{
float c = c0[i] * scale;
float m = c1[i] * scale;
float y = c2[i] * scale;
float k = 1 - (c3[i] * scale);
c0[i] = (1 - c) * k;
c1[i] = (1 - m) * k;
c2[i] = (1 - y) * k;
}
}
public static void ConvertFromRgb(in ComponentValues values, float maxValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
{
Span<float> c = values.Component0;
Span<float> m = values.Component1;
Span<float> y = values.Component2;
Span<float> k = values.Component3;
for (int i = 0; i < c.Length; i++)
{
float ctmp = 255F - rLane[i];
float mtmp = 255F - gLane[i];
float ytmp = 255F - bLane[i];
float ktmp = MathF.Min(MathF.Min(ctmp, mtmp), ytmp);
if (ktmp >= 255F)
{
ctmp = 0F;
mtmp = 0F;
ytmp = 0F;
}
else
{
float divisor = 1 / (255F - ktmp);
ctmp = (ctmp - ktmp) * divisor;
mtmp = (mtmp - ktmp) * divisor;
ytmp = (ytmp - ktmp) * divisor;
}
c[i] = ctmp * maxValue;
m[i] = mtmp * maxValue;
y[i] = ytmp * maxValue;
k[i] = ktmp;
}
}
public static void ConvertToRgbInPlaceWithIcc(Configuration configuration, IccProfile profile, in ComponentValues values, float maxValue)
{
using IMemoryOwner<float> memoryOwner = configuration.MemoryAllocator.Allocate<float>(values.Component0.Length * 4);
Span<float> packed = memoryOwner.Memory.Span;
Span<float> c0 = values.Component0;
Span<float> c1 = values.Component1;
Span<float> c2 = values.Component2;
Span<float> c3 = values.Component3;
PackedNormalizeInterleave4(c0, c1, c2, c3, packed, maxValue);
Span<Cmyk> source = MemoryMarshal.Cast<float, Cmyk>(packed);
Span<Rgb> destination = MemoryMarshal.Cast<float, Rgb>(packed)[..source.Length];
ColorConversionOptions options = new()
{
SourceIccProfile = profile,
TargetIccProfile = CompactSrgbV4Profile.Profile,
};
ColorProfileConverter converter = new(options);
converter.Convert<Cmyk, Rgb>(source, destination);
UnpackDeinterleave3(MemoryMarshal.Cast<float, Vector3>(packed)[..source.Length], c0, c1, c2);
}
}
}

99
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector128.cs

@ -0,0 +1,99 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal abstract partial class JpegColorConverterBase
{
internal sealed class TiffCmykVector128 : JpegColorConverterVector128
{
public TiffCmykVector128(int precision)
: base(JpegColorSpace.TiffCmyk, precision)
{
}
/// <inheritdoc/>
public override void ConvertToRgbInPlace(in ComponentValues values)
{
ref Vector128<float> c0Base =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector128<float> c1Base =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector128<float> c2Base =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector128<float> c3Base =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component3));
Vector128<float> scale = Vector128.Create(1 / this.MaximumValue);
nuint n = values.Component0.Vector128Count<float>();
for (nuint i = 0; i < n; i++)
{
ref Vector128<float> c = ref Unsafe.Add(ref c0Base, i);
ref Vector128<float> m = ref Unsafe.Add(ref c1Base, i);
ref Vector128<float> y = ref Unsafe.Add(ref c2Base, i);
Vector128<float> k = Unsafe.Add(ref c3Base, i);
k = Vector128<float>.One - (k * scale);
c = (Vector128<float>.One - (c * scale)) * k;
m = (Vector128<float>.One - (m * scale)) * k;
y = (Vector128<float>.One - (y * scale)) * k;
}
}
/// <inheritdoc/>
public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
=> TiffCmykScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
/// <inheritdoc/>
public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
=> ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane);
public static void ConvertFromRgb(in ComponentValues values, float maxValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
{
ref Vector128<float> destC =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector128<float> destM =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector128<float> destY =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector128<float> destK =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component3));
ref Vector128<float> srcR =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(rLane));
ref Vector128<float> srcG =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(gLane));
ref Vector128<float> srcB =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(bLane));
Vector128<float> scale = Vector128.Create(maxValue);
nuint n = values.Component0.Vector128Count<float>();
for (nuint i = 0; i < n; i++)
{
Vector128<float> ctmp = scale - Unsafe.Add(ref srcR, i);
Vector128<float> mtmp = scale - Unsafe.Add(ref srcG, i);
Vector128<float> ytmp = scale - Unsafe.Add(ref srcB, i);
Vector128<float> ktmp = Vector128.Min(ctmp, Vector128.Min(mtmp, ytmp));
Vector128<float> kMask = ~Vector128.Equals(ktmp, scale);
Vector128<float> divisor = Vector128<float>.One / (scale - ktmp);
ctmp = ((ctmp - ktmp) * divisor) & kMask;
mtmp = ((mtmp - ktmp) * divisor) & kMask;
ytmp = ((ytmp - ktmp) * divisor) & kMask;
Unsafe.Add(ref destC, i) = ctmp * scale;
Unsafe.Add(ref destM, i) = mtmp * scale;
Unsafe.Add(ref destY, i) = ytmp * scale;
Unsafe.Add(ref destK, i) = ktmp;
}
}
}
}

99
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector256.cs

@ -0,0 +1,99 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal abstract partial class JpegColorConverterBase
{
internal sealed class TiffCmykVector256 : JpegColorConverterVector256
{
public TiffCmykVector256(int precision)
: base(JpegColorSpace.TiffCmyk, precision)
{
}
/// <inheritdoc/>
public override void ConvertToRgbInPlace(in ComponentValues values)
{
ref Vector256<float> c0Base =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector256<float> c1Base =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector256<float> c2Base =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector256<float> c3Base =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));
Vector256<float> scale = Vector256.Create(1 / this.MaximumValue);
nuint n = values.Component0.Vector256Count<float>();
for (nuint i = 0; i < n; i++)
{
ref Vector256<float> c = ref Unsafe.Add(ref c0Base, i);
ref Vector256<float> m = ref Unsafe.Add(ref c1Base, i);
ref Vector256<float> y = ref Unsafe.Add(ref c2Base, i);
Vector256<float> k = Unsafe.Add(ref c3Base, i);
k = Vector256<float>.One - (k * scale);
c = (Vector256<float>.One - (c * scale)) * k;
m = (Vector256<float>.One - (m * scale)) * k;
y = (Vector256<float>.One - (y * scale)) * k;
}
}
/// <inheritdoc/>
public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
=> CmykScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
/// <inheritdoc/>
public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
=> ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane);
public static void ConvertFromRgb(in ComponentValues values, float maxValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
{
ref Vector256<float> destC =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector256<float> destM =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector256<float> destY =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector256<float> destK =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));
ref Vector256<float> srcR =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(rLane));
ref Vector256<float> srcG =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(gLane));
ref Vector256<float> srcB =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(bLane));
Vector256<float> scale = Vector256.Create(maxValue);
nuint n = values.Component0.Vector256Count<float>();
for (nuint i = 0; i < n; i++)
{
Vector256<float> ctmp = scale - Unsafe.Add(ref srcR, i);
Vector256<float> mtmp = scale - Unsafe.Add(ref srcG, i);
Vector256<float> ytmp = scale - Unsafe.Add(ref srcB, i);
Vector256<float> ktmp = Vector256.Min(ctmp, Vector256.Min(mtmp, ytmp));
Vector256<float> kMask = ~Vector256.Equals(ktmp, scale);
Vector256<float> divisor = Vector256<float>.One / (scale - ktmp);
ctmp = ((ctmp - ktmp) * divisor) & kMask;
mtmp = ((mtmp - ktmp) * divisor) & kMask;
ytmp = ((ytmp - ktmp) * divisor) & kMask;
Unsafe.Add(ref destC, i) = ctmp * scale;
Unsafe.Add(ref destM, i) = mtmp * scale;
Unsafe.Add(ref destY, i) = ytmp * scale;
Unsafe.Add(ref destK, i) = ktmp;
}
}
}
}

108
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector512.cs

@ -0,0 +1,108 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal abstract partial class JpegColorConverterBase
{
internal sealed class TiffCmykVector512 : JpegColorConverterVector512
{
public TiffCmykVector512(int precision)
: base(JpegColorSpace.TiffCmyk, precision)
{
}
/// <inheritdoc/>
public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
=> TiffCmykScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
/// <inheritdoc/>
protected override void ConvertToRgbInPlaceVectorized(in ComponentValues values)
{
ref Vector512<float> c0Base =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector512<float> c1Base =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector512<float> c2Base =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector512<float> c3Base =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component3));
// Used for the color conversion
Vector512<float> scale = Vector512.Create(1 / this.MaximumValue);
nuint n = values.Component0.Vector512Count<float>();
for (nuint i = 0; i < n; i++)
{
ref Vector512<float> c = ref Unsafe.Add(ref c0Base, i);
ref Vector512<float> m = ref Unsafe.Add(ref c1Base, i);
ref Vector512<float> y = ref Unsafe.Add(ref c2Base, i);
Vector512<float> k = Unsafe.Add(ref c3Base, i);
k = Vector512<float>.One - (k * scale);
c = (Vector512<float>.One - (c * scale)) * k;
m = (Vector512<float>.One - (m * scale)) * k;
y = (Vector512<float>.One - (y * scale)) * k;
}
}
/// <inheritdoc/>
protected override void ConvertFromRgbVectorized(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
=> ConvertFromRgbVectorized(in values, this.MaximumValue, rLane, gLane, bLane);
/// <inheritdoc/>
protected override void ConvertToRgbInPlaceScalarRemainder(in ComponentValues values)
=> TiffCmykScalar.ConvertToRgbInPlace(values, this.MaximumValue);
/// <inheritdoc/>
protected override void ConvertFromRgbScalarRemainder(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
=> TiffCmykScalar.ConvertFromRgb(values, this.MaximumValue, rLane, gLane, bLane);
internal static void ConvertFromRgbVectorized(in ComponentValues values, float maxValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
{
ref Vector512<float> destC =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector512<float> destM =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector512<float> destY =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector512<float> destK =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component3));
ref Vector512<float> srcR =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(rLane));
ref Vector512<float> srcG =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(gLane));
ref Vector512<float> srcB =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(bLane));
Vector512<float> scale = Vector512.Create(maxValue);
nuint n = values.Component0.Vector512Count<float>();
for (nuint i = 0; i < n; i++)
{
Vector512<float> ctmp = scale - Unsafe.Add(ref srcR, i);
Vector512<float> mtmp = scale - Unsafe.Add(ref srcG, i);
Vector512<float> ytmp = scale - Unsafe.Add(ref srcB, i);
Vector512<float> ktmp = Vector512.Min(ctmp, Vector512.Min(mtmp, ytmp));
Vector512<float> kMask = ~Vector512.Equals(ktmp, scale);
Vector512<float> divisor = Vector512<float>.One / (scale - ktmp);
ctmp = ((ctmp - ktmp) * divisor) & kMask;
mtmp = ((mtmp - ktmp) * divisor) & kMask;
ytmp = ((ytmp - ktmp) * divisor) & kMask;
Unsafe.Add(ref destC, i) = ctmp * scale;
Unsafe.Add(ref destM, i) = mtmp * scale;
Unsafe.Add(ref destY, i) = ytmp * scale;
Unsafe.Add(ref destK, i) = ktmp;
}
}
}
}

153
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKScalar.cs

@ -0,0 +1,153 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Buffers;
using System.Numerics;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.ColorProfiles;
using SixLabors.ImageSharp.ColorProfiles.Icc;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal abstract partial class JpegColorConverterBase
{
/// <summary>
/// Color converter for tiff images, which use the jpeg compression and CMYK colorspace.
/// </summary>
internal sealed class TiffYccKScalar : JpegColorConverterScalar
{
// Derived from ITU-T Rec. T.871
internal const float RCrMult = 1.402f;
internal const float GCbMult = (float)(0.114 * 1.772 / 0.587);
internal const float GCrMult = (float)(0.299 * 1.402 / 0.587);
internal const float BCbMult = 1.772f;
public TiffYccKScalar(int precision)
: base(JpegColorSpace.TiffYccK, precision)
{
}
/// <inheritdoc/>
public override void ConvertToRgbInPlace(in ComponentValues values)
=> ConvertToRgbInPlace(in values, this.MaximumValue, this.HalfValue);
/// <inheritdoc/>
public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
=> ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
=> ConvertFromRgb(values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane);
public static void ConvertToRgbInPlace(in ComponentValues values, float maxValue, float halfValue)
{
Span<float> c0 = values.Component0;
Span<float> c1 = values.Component1;
Span<float> c2 = values.Component2;
Span<float> c3 = values.Component3;
float scale = 1F / maxValue;
halfValue *= scale;
for (int i = 0; i < values.Component0.Length; i++)
{
float y = c0[i] * scale;
float cb = (c1[i] * scale) - halfValue;
float cr = (c2[i] * scale) - halfValue;
float scaledK = 1 - (c3[i] * scale);
// r = y + (1.402F * cr);
// g = y - (0.344136F * cb) - (0.714136F * cr);
// b = y + (1.772F * cb);
c0[i] = (y + (RCrMult * cr)) * scaledK;
c1[i] = (y - (GCbMult * cb) - (GCrMult * cr)) * scaledK;
c2[i] = (y + (BCbMult * cb)) * scaledK;
}
}
public static void ConvertFromRgb(in ComponentValues values, float halfValue, float maxValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
{
Span<float> y = values.Component0;
Span<float> cb = values.Component1;
Span<float> cr = values.Component2;
Span<float> k = values.Component3;
for (int i = 0; i < cr.Length; i++)
{
// Scale down to [0-1]
const float divisor = 1F / 255F;
float r = rLane[i] * divisor;
float g = gLane[i] * divisor;
float b = bLane[i] * divisor;
float ytmp;
float cbtmp;
float crtmp;
float ktmp = 1F - MathF.Max(r, MathF.Max(g, b));
if (ktmp >= 1F)
{
ytmp = 0F;
cbtmp = 0.5F;
crtmp = 0.5F;
ktmp = maxValue;
}
else
{
float kmask = 1F / (1F - ktmp);
r *= kmask;
g *= kmask;
b *= kmask;
// Scale to [0-maxValue]
ytmp = ((0.299f * r) + (0.587f * g) + (0.114f * b)) * maxValue;
cbtmp = halfValue - (((0.168736f * r) - (0.331264f * g) + (0.5f * b)) * maxValue);
crtmp = halfValue + (((0.5f * r) - (0.418688f * g) - (0.081312f * b)) * maxValue);
ktmp *= maxValue;
}
y[i] = ytmp;
cb[i] = cbtmp;
cr[i] = crtmp;
k[i] = ktmp;
}
}
public static void ConvertToRgbInPlaceWithIcc(Configuration configuration, IccProfile profile, in ComponentValues values, float maxValue)
{
using IMemoryOwner<float> memoryOwner = configuration.MemoryAllocator.Allocate<float>(values.Component0.Length * 4);
Span<float> packed = memoryOwner.Memory.Span;
Span<float> c0 = values.Component0;
Span<float> c1 = values.Component1;
Span<float> c2 = values.Component2;
Span<float> c3 = values.Component3;
PackedNormalizeInterleave4(c0, c1, c2, c3, packed, maxValue);
ColorProfileConverter converter = new();
Span<Cmyk> source = MemoryMarshal.Cast<float, Cmyk>(packed);
// YccK is not a defined ICC color space — it's a JPEG-specific encoding used in Adobe-style CMYK JPEGs.
// ICC profiles expect colorimetric CMYK values, so we must first convert YccK to CMYK using a hardcoded inverse transform.
// This transform assumes Rec.601 YCbCr coefficients and an inverted K channel.
//
// The YccK => Cmyk conversion is independent of any embedded ICC profile.
// Since the same RGB working space is used during conversion to and from XYZ,
// colorimetric accuracy is preserved.
converter.Convert<YccK, Cmyk>(MemoryMarshal.Cast<Cmyk, YccK>(source), source);
Span<Rgb> destination = MemoryMarshal.Cast<float, Rgb>(packed)[..source.Length];
ColorConversionOptions options = new()
{
SourceIccProfile = profile,
TargetIccProfile = CompactSrgbV4Profile.Profile,
};
converter = new(options);
converter.Convert<Cmyk, Rgb>(source, destination);
UnpackDeinterleave3(MemoryMarshal.Cast<float, Vector3>(packed)[..source.Length], c0, c1, c2);
}
}
}

131
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector128.cs

@ -0,0 +1,131 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal abstract partial class JpegColorConverterBase
{
internal sealed class TiffYccKVector128 : JpegColorConverterVector128
{
public TiffYccKVector128(int precision)
: base(JpegColorSpace.TiffYccK, precision)
{
}
/// <inheritdoc/>
public override void ConvertToRgbInPlace(in ComponentValues values)
{
ref Vector128<float> c0Base =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector128<float> c1Base =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector128<float> c2Base =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector128<float> c3Base =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component3));
Vector128<float> scale = Vector128.Create(1F / this.MaximumValue);
Vector128<float> chromaOffset = Vector128.Create(this.HalfValue) * scale;
Vector128<float> rCrMult = Vector128.Create(YCbCrScalar.RCrMult);
Vector128<float> gCbMult = Vector128.Create(-YCbCrScalar.GCbMult);
Vector128<float> gCrMult = Vector128.Create(-YCbCrScalar.GCrMult);
Vector128<float> bCbMult = Vector128.Create(YCbCrScalar.BCbMult);
nuint n = values.Component0.Vector128Count<float>();
for (nuint i = 0; i < n; i++)
{
ref Vector128<float> c0 = ref Unsafe.Add(ref c0Base, i);
ref Vector128<float> c1 = ref Unsafe.Add(ref c1Base, i);
ref Vector128<float> c2 = ref Unsafe.Add(ref c2Base, i);
ref Vector128<float> c3 = ref Unsafe.Add(ref c3Base, i);
Vector128<float> y = c0 * scale;
Vector128<float> cb = (c1 * scale) - chromaOffset;
Vector128<float> cr = (c2 * scale) - chromaOffset;
Vector128<float> scaledK = Vector128<float>.One - (c3 * scale);
// r = y + (1.402F * cr);
// g = y - (0.344136F * cb) - (0.714136F * cr);
// b = y + (1.772F * cb);
Vector128<float> r = Vector128_.MultiplyAdd(y, cr, rCrMult) * scaledK;
Vector128<float> g = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(y, cb, gCbMult), cr, gCrMult) * scaledK;
Vector128<float> b = Vector128_.MultiplyAdd(y, cb, bCbMult) * scaledK;
c0 = r;
c1 = g;
c2 = b;
}
}
/// <inheritdoc/>
public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
=> TiffYccKScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
/// <inheritdoc/>
public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
{
ref Vector128<float> srcR =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(rLane));
ref Vector128<float> srcG =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(gLane));
ref Vector128<float> srcB =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(bLane));
ref Vector128<float> destY =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector128<float> destCb =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector128<float> destCr =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector128<float> destK =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component3));
Vector128<float> maxSourceValue = Vector128.Create(1 / 255F);
Vector128<float> maxSampleValue = Vector128.Create(this.MaximumValue);
Vector128<float> chromaOffset = Vector128.Create(this.HalfValue);
Vector128<float> f0299 = Vector128.Create(0.299f);
Vector128<float> f0587 = Vector128.Create(0.587f);
Vector128<float> f0114 = Vector128.Create(0.114f);
Vector128<float> fn0168736 = Vector128.Create(-0.168736f);
Vector128<float> fn0331264 = Vector128.Create(-0.331264f);
Vector128<float> fn0418688 = Vector128.Create(-0.418688f);
Vector128<float> fn0081312F = Vector128.Create(-0.081312F);
Vector128<float> f05 = Vector128.Create(0.5f);
nuint n = values.Component0.Vector128Count<float>();
for (nuint i = 0; i < n; i++)
{
Vector128<float> r = Unsafe.Add(ref srcR, i) * maxSourceValue;
Vector128<float> g = Unsafe.Add(ref srcG, i) * maxSourceValue;
Vector128<float> b = Unsafe.Add(ref srcB, i) * maxSourceValue;
Vector128<float> ktmp = Vector128<float>.One - Vector128.Max(r, Vector128.Min(g, b));
Vector128<float> kMask = ~Vector128.Equals(ktmp, Vector128<float>.One);
Vector128<float> divisor = Vector128<float>.One / (Vector128<float>.One - ktmp);
r = (r * divisor) & kMask;
g = (g * divisor) & kMask;
b = (b * divisor) & kMask;
// y = 0 + (0.299 * r) + (0.587 * g) + (0.114 * b)
// cb = 128 - (0.168736 * r) - (0.331264 * g) + (0.5 * b)
// cr = 128 + (0.5 * r) - (0.418688 * g) - (0.081312 * b)
Vector128<float> y = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(f0114 * b, f0587, g), f0299, r);
Vector128<float> cb = chromaOffset + Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(f05 * b, fn0331264, g), fn0168736, r);
Vector128<float> cr = chromaOffset + Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(fn0081312F * b, fn0418688, g), f05, r);
Unsafe.Add(ref destY, i) = y * maxSampleValue;
Unsafe.Add(ref destCb, i) = chromaOffset + (cb * maxSampleValue);
Unsafe.Add(ref destCr, i) = chromaOffset + (cr * maxSampleValue);
Unsafe.Add(ref destK, i) = ktmp * maxSampleValue;
}
}
}
}

131
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector256.cs

@ -0,0 +1,131 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal abstract partial class JpegColorConverterBase
{
internal sealed class TiffYccKVector256 : JpegColorConverterVector256
{
public TiffYccKVector256(int precision)
: base(JpegColorSpace.TiffYccK, precision)
{
}
/// <inheritdoc/>
public override void ConvertToRgbInPlace(in ComponentValues values)
{
ref Vector256<float> c0Base =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector256<float> c1Base =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector256<float> c2Base =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector256<float> c3Base =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));
Vector256<float> scale = Vector256.Create(1F / this.MaximumValue);
Vector256<float> chromaOffset = Vector256.Create(this.HalfValue) * scale;
Vector256<float> rCrMult = Vector256.Create(YCbCrScalar.RCrMult);
Vector256<float> gCbMult = Vector256.Create(-YCbCrScalar.GCbMult);
Vector256<float> gCrMult = Vector256.Create(-YCbCrScalar.GCrMult);
Vector256<float> bCbMult = Vector256.Create(YCbCrScalar.BCbMult);
nuint n = values.Component0.Vector256Count<float>();
for (nuint i = 0; i < n; i++)
{
ref Vector256<float> c0 = ref Unsafe.Add(ref c0Base, i);
ref Vector256<float> c1 = ref Unsafe.Add(ref c1Base, i);
ref Vector256<float> c2 = ref Unsafe.Add(ref c2Base, i);
ref Vector256<float> c3 = ref Unsafe.Add(ref c3Base, i);
Vector256<float> y = c0 * scale;
Vector256<float> cb = (c1 * scale) - chromaOffset;
Vector256<float> cr = (c2 * scale) - chromaOffset;
Vector256<float> scaledK = Vector256<float>.One - (c3 * scale);
// r = y + (1.402F * cr);
// g = y - (0.344136F * cb) - (0.714136F * cr);
// b = y + (1.772F * cb);
Vector256<float> r = Vector256_.MultiplyAdd(y, cr, rCrMult) * scaledK;
Vector256<float> g = Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(y, cb, gCbMult), cr, gCrMult) * scaledK;
Vector256<float> b = Vector256_.MultiplyAdd(y, cb, bCbMult) * scaledK;
c0 = r;
c1 = g;
c2 = b;
}
}
/// <inheritdoc/>
public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
=> TiffYccKScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
/// <inheritdoc/>
public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
{
ref Vector256<float> srcR =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(rLane));
ref Vector256<float> srcG =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(gLane));
ref Vector256<float> srcB =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(bLane));
ref Vector256<float> destY =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector256<float> destCb =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector256<float> destCr =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector256<float> destK =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));
Vector256<float> maxSourceValue = Vector256.Create(255F);
Vector256<float> maxSampleValue = Vector256.Create(this.MaximumValue);
Vector256<float> chromaOffset = Vector256.Create(this.HalfValue);
Vector256<float> f0299 = Vector256.Create(0.299f);
Vector256<float> f0587 = Vector256.Create(0.587f);
Vector256<float> f0114 = Vector256.Create(0.114f);
Vector256<float> fn0168736 = Vector256.Create(-0.168736f);
Vector256<float> fn0331264 = Vector256.Create(-0.331264f);
Vector256<float> fn0418688 = Vector256.Create(-0.418688f);
Vector256<float> fn0081312F = Vector256.Create(-0.081312F);
Vector256<float> f05 = Vector256.Create(0.5f);
nuint n = values.Component0.Vector256Count<float>();
for (nuint i = 0; i < n; i++)
{
Vector256<float> r = Unsafe.Add(ref srcR, i) / maxSourceValue;
Vector256<float> g = Unsafe.Add(ref srcG, i) / maxSourceValue;
Vector256<float> b = Unsafe.Add(ref srcB, i) / maxSourceValue;
Vector256<float> ktmp = Vector256<float>.One - Vector256.Max(r, Vector256.Min(g, b));
Vector256<float> kMask = ~Vector256.Equals(ktmp, Vector256<float>.One);
Vector256<float> divisor = Vector256<float>.One / (Vector256<float>.One - ktmp);
r = (r * divisor) & kMask;
g = (g * divisor) & kMask;
b = (b * divisor) & kMask;
// y = 0 + (0.299 * r) + (0.587 * g) + (0.114 * b)
// cb = 128 - (0.168736 * r) - (0.331264 * g) + (0.5 * b)
// cr = 128 + (0.5 * r) - (0.418688 * g) - (0.081312 * b)
Vector256<float> y = Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(f0114 * b, f0587, g), f0299, r);
Vector256<float> cb = chromaOffset + Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(f05 * b, fn0331264, g), fn0168736, r);
Vector256<float> cr = chromaOffset + Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(fn0081312F * b, fn0418688, g), f05, r);
Unsafe.Add(ref destY, i) = y * maxSampleValue;
Unsafe.Add(ref destCb, i) = chromaOffset + (cb * maxSampleValue);
Unsafe.Add(ref destCr, i) = chromaOffset + (cr * maxSampleValue);
Unsafe.Add(ref destK, i) = ktmp * maxSampleValue;
}
}
}
}

142
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector512.cs

@ -0,0 +1,142 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal abstract partial class JpegColorConverterBase
{
internal sealed class TiffYccKVector512 : JpegColorConverterVector512
{
public TiffYccKVector512(int precision)
: base(JpegColorSpace.TiffYccK, precision)
{
}
/// <inheritdoc/>
public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
=> TiffYccKScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
/// <inheritdoc/>
protected override void ConvertToRgbInPlaceVectorized(in ComponentValues values)
{
ref Vector512<float> c0Base =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector512<float> c1Base =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector512<float> c2Base =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector512<float> c3Base =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component3));
Vector512<float> scale = Vector512.Create(1F / this.MaximumValue);
Vector512<float> chromaOffset = Vector512.Create(this.HalfValue) * scale;
Vector512<float> rCrMult = Vector512.Create(YCbCrScalar.RCrMult);
Vector512<float> gCbMult = Vector512.Create(-YCbCrScalar.GCbMult);
Vector512<float> gCrMult = Vector512.Create(-YCbCrScalar.GCrMult);
Vector512<float> bCbMult = Vector512.Create(YCbCrScalar.BCbMult);
nuint n = values.Component0.Vector512Count<float>();
for (nuint i = 0; i < n; i++)
{
ref Vector512<float> c0 = ref Unsafe.Add(ref c0Base, i);
ref Vector512<float> c1 = ref Unsafe.Add(ref c1Base, i);
ref Vector512<float> c2 = ref Unsafe.Add(ref c2Base, i);
ref Vector512<float> c3 = ref Unsafe.Add(ref c3Base, i);
Vector512<float> y = c0 * scale;
Vector512<float> cb = (c1 * scale) - chromaOffset;
Vector512<float> cr = (c2 * scale) - chromaOffset;
Vector512<float> scaledK = Vector512<float>.One - (c3 * scale);
// r = y + (1.402F * cr);
// g = y - (0.344136F * cb) - (0.714136F * cr);
// b = y + (1.772F * cb);
Vector512<float> r = Vector512_.MultiplyAdd(y, cr, rCrMult) * scaledK;
Vector512<float> g = Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(y, cb, gCbMult), cr, gCrMult) * scaledK;
Vector512<float> b = Vector512_.MultiplyAdd(y, cb, bCbMult) * scaledK;
c0 = r;
c1 = g;
c2 = b;
}
}
/// <inheritdoc/>
protected override void ConvertFromRgbVectorized(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
=> ConvertFromRgbVectorized(in values, this.MaximumValue, this.HalfValue, rLane, gLane, bLane);
/// <inheritdoc/>
protected override void ConvertToRgbInPlaceScalarRemainder(in ComponentValues values)
=> TiffYccKScalar.ConvertToRgbInPlace(values, this.MaximumValue, this.HalfValue);
/// <inheritdoc/>
protected override void ConvertFromRgbScalarRemainder(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
=> TiffYccKScalar.ConvertFromRgb(values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane);
internal static void ConvertFromRgbVectorized(in ComponentValues values, float maxValue, float halfValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
{
ref Vector512<float> srcR =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(rLane));
ref Vector512<float> srcG =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(gLane));
ref Vector512<float> srcB =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(bLane));
ref Vector512<float> destY =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector512<float> destCb =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector512<float> destCr =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector512<float> destK =
ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component3));
Vector512<float> maxSourceValue = Vector512.Create(255F);
Vector512<float> maxSampleValue = Vector512.Create(maxValue);
Vector512<float> chromaOffset = Vector512.Create(halfValue);
Vector512<float> f0299 = Vector512.Create(0.299f);
Vector512<float> f0587 = Vector512.Create(0.587f);
Vector512<float> f0114 = Vector512.Create(0.114f);
Vector512<float> fn0168736 = Vector512.Create(-0.168736f);
Vector512<float> fn0331264 = Vector512.Create(-0.331264f);
Vector512<float> fn0418688 = Vector512.Create(-0.418688f);
Vector512<float> fn0081312F = Vector512.Create(-0.081312F);
Vector512<float> f05 = Vector512.Create(0.5f);
nuint n = values.Component0.Vector512Count<float>();
for (nuint i = 0; i < n; i++)
{
Vector512<float> r = Unsafe.Add(ref srcR, i) / maxSourceValue;
Vector512<float> g = Unsafe.Add(ref srcG, i) / maxSourceValue;
Vector512<float> b = Unsafe.Add(ref srcB, i) / maxSourceValue;
Vector512<float> ktmp = Vector512<float>.One - Vector512.Max(r, Vector512.Min(g, b));
Vector512<float> kMask = ~Vector512.Equals(ktmp, Vector512<float>.One);
Vector512<float> divisor = Vector512<float>.One / (Vector512<float>.One - ktmp);
r = (r * divisor) & kMask;
g = (g * divisor) & kMask;
b = (b * divisor) & kMask;
// y = 0 + (0.299 * r) + (0.587 * g) + (0.114 * b)
// cb = 128 - (0.168736 * r) - (0.331264 * g) + (0.5 * b)
// cr = 128 + (0.5 * r) - (0.418688 * g) - (0.081312 * b)
Vector512<float> y = Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(f0114 * b, f0587, g), f0299, r);
Vector512<float> cb = chromaOffset + Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(f05 * b, fn0331264, g), fn0168736, r);
Vector512<float> cr = chromaOffset + Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(fn0081312F * b, fn0418688, g), f05, r);
Unsafe.Add(ref destY, i) = y * maxSampleValue;
Unsafe.Add(ref destCb, i) = chromaOffset + (cb * maxSampleValue);
Unsafe.Add(ref destCr, i) = chromaOffset + (cr * maxSampleValue);
Unsafe.Add(ref destK, i) = ktmp * maxSampleValue;
}
}
}
}

6
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKScalar.cs

@ -14,7 +14,7 @@ internal abstract partial class JpegColorConverterBase
{
internal sealed class YccKScalar : JpegColorConverterScalar
{
// derived from ITU-T Rec. T.871
// Derived from ITU-T Rec. T.871
internal const float RCrMult = 1.402f;
internal const float GCbMult = (float)(0.114 * 1.772 / 0.587);
internal const float GCrMult = (float)(0.299 * 1.402 / 0.587);
@ -27,7 +27,7 @@ internal abstract partial class JpegColorConverterBase
/// <inheritdoc/>
public override void ConvertToRgbInPlace(in ComponentValues values)
=> ConvertToRgpInPlace(values, this.MaximumValue, this.HalfValue);
=> ConvertToRgbInPlace(values, this.MaximumValue, this.HalfValue);
/// <inheritdoc/>
public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
@ -37,7 +37,7 @@ internal abstract partial class JpegColorConverterBase
public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
=> ConvertFromRgb(values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane);
public static void ConvertToRgpInPlace(in ComponentValues values, float maxValue, float halfValue)
public static void ConvertToRgbInPlace(in ComponentValues values, float maxValue, float halfValue)
{
Span<float> c0 = values.Component0;
Span<float> c1 = values.Component1;

10
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs

@ -82,7 +82,7 @@ internal abstract partial class JpegColorConverterBase
/// <inheritdoc/>
protected override void ConvertToRgbInPlaceScalarRemainder(in ComponentValues values)
=> YccKScalar.ConvertToRgpInPlace(values, this.MaximumValue, this.HalfValue);
=> YccKScalar.ConvertToRgbInPlace(values, this.MaximumValue, this.HalfValue);
/// <inheritdoc/>
protected override void ConvertFromRgbVectorized(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
@ -138,12 +138,6 @@ internal abstract partial class JpegColorConverterBase
/// <inheritdoc/>
protected override void ConvertFromRgbScalarRemainder(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
{
// rgb -> cmyk
CmykScalar.ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane);
// cmyk -> ycck
YccKScalar.ConvertFromRgb(in values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane);
}
=> YccKScalar.ConvertFromRgb(in values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane);
}
}

77
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverterBase.cs

@ -153,6 +153,39 @@ internal abstract partial class JpegColorConverterBase
}
}
public static void PackedNormalizeInterleave4(
ReadOnlySpan<float> xLane,
ReadOnlySpan<float> yLane,
ReadOnlySpan<float> zLane,
ReadOnlySpan<float> wLane,
Span<float> packed,
float maxValue)
{
DebugGuard.IsTrue(packed.Length % 4 == 0, "Packed length must be divisible by 4.");
DebugGuard.IsTrue(yLane.Length == xLane.Length, nameof(yLane), "Channels must be of same size!");
DebugGuard.IsTrue(zLane.Length == xLane.Length, nameof(zLane), "Channels must be of same size!");
DebugGuard.IsTrue(wLane.Length == xLane.Length, nameof(wLane), "Channels must be of same size!");
DebugGuard.MustBeLessThanOrEqualTo(packed.Length / 4, xLane.Length, nameof(packed));
float scale = 1F / maxValue;
// TODO: Investigate SIMD version of this.
ref float xLaneRef = ref MemoryMarshal.GetReference(xLane);
ref float yLaneRef = ref MemoryMarshal.GetReference(yLane);
ref float zLaneRef = ref MemoryMarshal.GetReference(zLane);
ref float wLaneRef = ref MemoryMarshal.GetReference(wLane);
ref float packedRef = ref MemoryMarshal.GetReference(packed);
for (nuint i = 0; i < (nuint)xLane.Length; i++)
{
nuint baseIdx = i * 4;
Unsafe.Add(ref packedRef, baseIdx) = Unsafe.Add(ref xLaneRef, i) * scale;
Unsafe.Add(ref packedRef, baseIdx + 1) = Unsafe.Add(ref yLaneRef, i) * scale;
Unsafe.Add(ref packedRef, baseIdx + 2) = Unsafe.Add(ref zLaneRef, i) * scale;
Unsafe.Add(ref packedRef, baseIdx + 3) = Unsafe.Add(ref wLaneRef, i) * scale;
}
}
public static void PackedInvertNormalizeInterleave4(
ReadOnlySpan<float> xLane,
ReadOnlySpan<float> yLane,
@ -198,6 +231,8 @@ internal abstract partial class JpegColorConverterBase
GetCmykConverter(8),
GetGrayScaleConverter(8),
GetRgbConverter(8),
GetTiffCmykConverter(8),
GetTiffYccKConverter(8),
// 12-bit converters
GetYCbCrConverter(12),
@ -205,6 +240,8 @@ internal abstract partial class JpegColorConverterBase
GetCmykConverter(12),
GetGrayScaleConverter(12),
GetRgbConverter(12),
GetTiffCmykConverter(12),
GetTiffYccKConverter(12),
];
/// <summary>
@ -327,6 +364,46 @@ internal abstract partial class JpegColorConverterBase
return new RgbScalar(precision);
}
private static JpegColorConverterBase GetTiffCmykConverter(int precision)
{
if (JpegColorConverterVector512.IsSupported)
{
return new TiffCmykVector512(precision);
}
if (JpegColorConverterVector256.IsSupported)
{
return new TiffCmykVector256(precision);
}
if (JpegColorConverterVector128.IsSupported)
{
return new TiffCmykVector128(precision);
}
return new TiffCmykScalar(precision);
}
private static JpegColorConverterBase GetTiffYccKConverter(int precision)
{
if (JpegColorConverterVector512.IsSupported)
{
return new TiffYccKVector512(precision);
}
if (JpegColorConverterVector256.IsSupported)
{
return new TiffYccKVector256(precision);
}
if (JpegColorConverterVector128.IsSupported)
{
return new TiffYccKVector128(precision);
}
return new TiffYccKScalar(precision);
}
/// <summary>
/// A stack-only struct to reference the input buffers using <see cref="ReadOnlySpan{T}"/>-s.
/// </summary>

3
src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs

@ -83,7 +83,8 @@ internal abstract class SpectralConverter
/// <param name="frame">The jpeg frame with the color space to convert to.</param>
/// <param name="jpegData">The raw JPEG data.</param>
/// <returns>The color converter.</returns>
protected virtual JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverterBase.GetConverter(jpegData.ColorSpace, frame.Precision);
protected virtual JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData)
=> JpegColorConverterBase.GetConverter(jpegData.ColorSpace, frame.Precision);
/// <summary>
/// Calculates image size with optional scaling.

3
src/ImageSharp/Formats/Jpeg/Components/Encoder/SpectralConverter{TPixel}.cs

@ -2,7 +2,6 @@
// Licensed under the Six Labors Split License.
using System.Buffers;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
@ -109,6 +108,8 @@ internal class SpectralConverter<TPixel> : SpectralConverter, IDisposable
int y = yy - this.pixelRowCounter;
// Unpack TPixel to r/g/b planes
// TODO: The individual implementation code would be much easier here if
// we scaled to [0-1] before passing to the individual converters.
int srcIndex = Math.Min(yy, pixelBufferLastVerticalIndex);
Span<TPixel> sourceRow = this.pixelBuffer.DangerousGetRowSpan(srcIndex);
PixelOperations<TPixel>.Instance.UnpackIntoRgbPlanes(rLane, gLane, bLane, sourceRow);

10
src/ImageSharp/Formats/Jpeg/Components/JpegColorSpace.cs

@ -23,6 +23,16 @@ internal enum JpegColorSpace
/// </summary>
Cmyk,
/// <summary>
/// YccK color space with 4 components, used with tiff images, which use jpeg compression.
/// </summary>
TiffYccK,
/// <summary>
/// Cmyk color space with 4 components, used with tiff images, which use jpeg compression.
/// </summary>
TiffCmyk,
/// <summary>
/// Color space with 3 components.
/// </summary>

18
src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs

@ -115,12 +115,14 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData
/// Initializes a new instance of the <see cref="JpegDecoderCore"/> class.
/// </summary>
/// <param name="options">The decoder options.</param>
public JpegDecoderCore(JpegDecoderOptions options)
/// <param name="iccProfile">The ICC profile to use for color conversion.</param>
public JpegDecoderCore(JpegDecoderOptions options, IccProfile iccProfile = null)
: base(options.GeneralOptions)
{
this.resizeMode = options.ResizeMode;
this.configuration = options.GeneralOptions.Configuration;
this.skipMetadata = options.GeneralOptions.SkipMetadata;
this.SetIccMetadata(iccProfile);
}
/// <summary>
@ -231,7 +233,7 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData
/// <param name="scanDecoder">The scan decoder.</param>
public void LoadTables(byte[] tableBytes, IJpegScanDecoder scanDecoder)
{
this.Metadata = new ImageMetadata();
this.Metadata ??= new ImageMetadata();
this.QuantizationTables = new Block8x8F[4];
this.scanDecoder = scanDecoder;
if (tableBytes.Length < 4)
@ -314,7 +316,7 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData
this.scanDecoder ??= new HuffmanScanDecoder(stream, spectralConverter, cancellationToken);
this.Metadata = new ImageMetadata();
this.Metadata ??= new ImageMetadata();
Span<byte> markerBuffer = stackalloc byte[2];
@ -678,6 +680,16 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData
}
}
private void SetIccMetadata(IccProfile profile)
{
if (!this.skipMetadata && profile?.CheckIsValid() == true)
{
this.hasIcc = true;
this.Metadata ??= new ImageMetadata();
this.Metadata.IccProfile = profile;
}
}
/// <summary>
/// Initializes the IPTC profile.
/// </summary>

9
src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs

@ -6,6 +6,7 @@ using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder;
using SixLabors.ImageSharp.Formats.Tiff.Constants;
using SixLabors.ImageSharp.IO;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.Metadata;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
using SixLabors.ImageSharp.PixelFormats;
@ -22,6 +23,8 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor
private readonly TiffPhotometricInterpretation photometricInterpretation;
private readonly ImageFrameMetadata metadata;
/// <summary>
/// Initializes a new instance of the <see cref="JpegTiffCompression"/> class.
/// </summary>
@ -29,6 +32,7 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor
/// <param name="memoryAllocator">The memoryAllocator to use for buffer allocations.</param>
/// <param name="width">The image width.</param>
/// <param name="bitsPerPixel">The bits per pixel.</param>
/// <param name="metadata">The image frame metadata.</param>
/// <param name="jpegTables">The JPEG tables containing the quantization and/or Huffman tables.</param>
/// <param name="photometricInterpretation">The photometric interpretation.</param>
public JpegTiffCompression(
@ -36,11 +40,13 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor
MemoryAllocator memoryAllocator,
int width,
int bitsPerPixel,
ImageFrameMetadata metadata,
byte[] jpegTables,
TiffPhotometricInterpretation photometricInterpretation)
: base(memoryAllocator, width, bitsPerPixel)
{
this.options = options;
this.metadata = metadata;
this.jpegTables = jpegTables;
this.photometricInterpretation = photometricInterpretation;
}
@ -61,7 +67,7 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor
private void DecodeJpegData(BufferedReadStream stream, Span<byte> buffer, CancellationToken cancellationToken)
{
using JpegDecoderCore jpegDecoder = new(this.options);
using JpegDecoderCore jpegDecoder = new(this.options, this.metadata.IccProfile);
Configuration configuration = this.options.GeneralOptions.Configuration;
switch (this.photometricInterpretation)
{
@ -85,6 +91,7 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor
case TiffPhotometricInterpretation.YCbCr:
case TiffPhotometricInterpretation.Rgb:
case TiffPhotometricInterpretation.Separated:
{
using SpectralConverter<Rgb24> spectralConverter = new TiffJpegSpectralConverter<Rgb24>(configuration, this.photometricInterpretation);
HuffmanScanDecoder scanDecoder = new(stream, spectralConverter, cancellationToken);

8
src/ImageSharp/Formats/Tiff/Compression/Decompressors/OldJpegTiffCompression.cs

@ -6,6 +6,7 @@ using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder;
using SixLabors.ImageSharp.Formats.Tiff.Constants;
using SixLabors.ImageSharp.IO;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.Metadata;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
using SixLabors.ImageSharp.PixelFormats;
@ -17,6 +18,8 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor
private readonly uint startOfImageMarker;
private readonly ImageFrameMetadata metadata;
private readonly TiffPhotometricInterpretation photometricInterpretation;
public OldJpegTiffCompression(
@ -24,12 +27,14 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor
MemoryAllocator memoryAllocator,
int width,
int bitsPerPixel,
ImageFrameMetadata metadata,
uint startOfImageMarker,
TiffPhotometricInterpretation photometricInterpretation)
: base(memoryAllocator, width, bitsPerPixel)
{
this.options = options;
this.startOfImageMarker = startOfImageMarker;
this.metadata = metadata;
this.photometricInterpretation = photometricInterpretation;
}
@ -47,7 +52,7 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor
private void DecodeJpegData(BufferedReadStream stream, Span<byte> buffer, CancellationToken cancellationToken)
{
using JpegDecoderCore jpegDecoder = new(this.options);
using JpegDecoderCore jpegDecoder = new(this.options, this.metadata.IccProfile);
Configuration configuration = this.options.GeneralOptions.Configuration;
switch (this.photometricInterpretation)
{
@ -71,6 +76,7 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor
case TiffPhotometricInterpretation.YCbCr:
case TiffPhotometricInterpretation.Rgb:
case TiffPhotometricInterpretation.Separated:
{
using SpectralConverter<Rgb24> spectralConverter = new TiffOldJpegSpectralConverter<Rgb24>(configuration, this.photometricInterpretation);

29
src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffJpegSpectralConverter{TPixel}.cs

@ -31,19 +31,30 @@ internal sealed class TiffJpegSpectralConverter<TPixel> : SpectralConverter<TPix
/// <inheritdoc/>
protected override JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData)
{
JpegColorSpace colorSpace = GetJpegColorSpaceFromPhotometricInterpretation(this.photometricInterpretation);
JpegColorSpace colorSpace = GetJpegColorSpace(this.photometricInterpretation, jpegData);
return JpegColorConverterBase.GetConverter(colorSpace, frame.Precision);
}
/// <summary>
/// This converter must be used only for RGB and YCbCr color spaces for performance reasons.
/// Photometric interpretation Rgb and YCbCr will be mapped to RGB colorspace, which means the jpeg decompression will leave the data as is (no color conversion).
/// The color conversion will be done after the decompression. For Separated/CMYK/YCCK, the jpeg color converter will handle the color conversion,
/// since the jpeg color converter needs to return RGB data and cannot return 4 component data.
/// For grayscale images <see cref="GrayJpegSpectralConverter{TPixel}"/> must be used.
/// </summary>
private static JpegColorSpace GetJpegColorSpaceFromPhotometricInterpretation(TiffPhotometricInterpretation interpretation)
=> interpretation switch
{
TiffPhotometricInterpretation.Rgb => JpegColorSpace.RGB,
TiffPhotometricInterpretation.YCbCr => JpegColorSpace.RGB,
_ => throw new InvalidImageContentException($"Invalid tiff photometric interpretation for jpeg encoding: {interpretation}"),
};
/// <param name="interpretation">
/// The <see cref="TiffPhotometricInterpretation"/> to convert to a <see cref="JpegColorSpace"/>.
/// </param>
/// <param name="data">
/// The <see cref="IRawJpegData"/> containing the color space information.
/// </param>
/// <exception cref="InvalidImageContentException">
/// Thrown when the <paramref name="interpretation"/> is not supported for JPEG encoding.
/// </exception>
private static JpegColorSpace GetJpegColorSpace(TiffPhotometricInterpretation interpretation, IRawJpegData data) => interpretation switch
{
TiffPhotometricInterpretation.Rgb => JpegColorSpace.RGB,
TiffPhotometricInterpretation.Separated => data.ColorSpace == JpegColorSpace.Ycck ? JpegColorSpace.TiffYccK : JpegColorSpace.TiffCmyk,
TiffPhotometricInterpretation.YCbCr => JpegColorSpace.RGB, // TODO: Why doesn't this use the YCbCr color space?
_ => throw new InvalidImageContentException($"Invalid TIFF photometric interpretation for JPEG encoding: {interpretation}"),
};
}

5
src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffOldJpegSpectralConverter{TPixel}.cs

@ -30,15 +30,16 @@ internal sealed class TiffOldJpegSpectralConverter<TPixel> : SpectralConverter<T
/// <inheritdoc/>
protected override JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData)
{
JpegColorSpace colorSpace = GetJpegColorSpaceFromPhotometricInterpretation(this.photometricInterpretation);
JpegColorSpace colorSpace = GetJpegColorSpaceFromPhotometricInterpretation(this.photometricInterpretation, jpegData);
return JpegColorConverterBase.GetConverter(colorSpace, frame.Precision);
}
private static JpegColorSpace GetJpegColorSpaceFromPhotometricInterpretation(TiffPhotometricInterpretation interpretation)
private static JpegColorSpace GetJpegColorSpaceFromPhotometricInterpretation(TiffPhotometricInterpretation interpretation, IRawJpegData data)
=> interpretation switch
{
// Like libtiff: Always treat the pixel data as YCbCr when the data is compressed with old jpeg compression.
TiffPhotometricInterpretation.Rgb => JpegColorSpace.YCbCr,
TiffPhotometricInterpretation.Separated => data.ColorSpace == JpegColorSpace.Ycck ? JpegColorSpace.TiffYccK : JpegColorSpace.TiffCmyk,
TiffPhotometricInterpretation.YCbCr => JpegColorSpace.YCbCr,
_ => throw new InvalidImageContentException($"Invalid tiff photometric interpretation for jpeg encoding: {interpretation}"),
};

6
src/ImageSharp/Formats/Tiff/Compression/TiffDecompressorsFactory.cs

@ -5,6 +5,7 @@ using SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors;
using SixLabors.ImageSharp.Formats.Tiff.Constants;
using SixLabors.ImageSharp.Formats.Tiff.PhotometricInterpretation;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.Metadata;
namespace SixLabors.ImageSharp.Formats.Tiff.Compression;
@ -17,6 +18,7 @@ internal static class TiffDecompressorsFactory
TiffPhotometricInterpretation photometricInterpretation,
int width,
int bitsPerPixel,
ImageFrameMetadata metadata,
TiffColorType colorType,
TiffPredictor predictor,
FaxCompressionOptions faxOptions,
@ -62,11 +64,11 @@ internal static class TiffDecompressorsFactory
case TiffDecoderCompressionType.Jpeg:
DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
return new JpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, jpegTables, photometricInterpretation);
return new JpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, metadata, jpegTables, photometricInterpretation);
case TiffDecoderCompressionType.OldJpeg:
DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
return new OldJpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, oldJpegStartOfImageMarker, photometricInterpretation);
return new OldJpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, metadata, oldJpegStartOfImageMarker, photometricInterpretation);
case TiffDecoderCompressionType.Webp:
DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");

23
src/ImageSharp/Formats/Tiff/PhotometricInterpretation/CmykTiffColor{TPixel}.cs

@ -1,7 +1,9 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using SixLabors.ImageSharp.ColorProfiles;
using SixLabors.ImageSharp.Formats.Tiff.Compression;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
@ -13,10 +15,31 @@ internal class CmykTiffColor<TPixel> : TiffBaseColorDecoder<TPixel>
private static readonly ColorProfileConverter ColorProfileConverter = new();
private const float Inv255 = 1f / 255f;
private readonly TiffDecoderCompressionType compression;
public CmykTiffColor(TiffDecoderCompressionType compression) => this.compression = compression;
/// <inheritdoc/>
public override void Decode(ReadOnlySpan<byte> data, Buffer2D<TPixel> pixels, int left, int top, int width, int height)
{
int offset = 0;
if (this.compression == TiffDecoderCompressionType.Jpeg)
{
for (int y = top; y < top + height; y++)
{
Span<TPixel> pixelRow = pixels.DangerousGetRowSpan(y).Slice(left, width);
for (int x = 0; x < pixelRow.Length; x++)
{
pixelRow[x] = TPixel.FromVector4(new Vector4(data[offset] * Inv255, data[offset + 1] * Inv255, data[offset + 2] * Inv255, 1.0f));
offset += 3;
}
}
return;
}
for (int y = top; y < top + height; y++)
{
Span<TPixel> pixelRow = pixels.DangerousGetRowSpan(y).Slice(left, width);

4
src/ImageSharp/Formats/Tiff/PhotometricInterpretation/TiffColorDecoderFactory{TPixel}.cs

@ -1,6 +1,7 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using SixLabors.ImageSharp.Formats.Tiff.Compression;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
@ -19,6 +20,7 @@ internal static class TiffColorDecoderFactory<TPixel>
Rational[] referenceBlackAndWhite,
Rational[] ycbcrCoefficients,
ushort[] ycbcrSubSampling,
TiffDecoderCompressionType compression,
ByteOrder byteOrder)
{
switch (colorType)
@ -410,7 +412,7 @@ internal static class TiffColorDecoderFactory<TPixel>
&& bitsPerSample.Channel1 == 8
&& bitsPerSample.Channel0 == 8,
"bitsPerSample");
return new CmykTiffColor<TPixel>();
return new CmykTiffColor<TPixel>(compression);
default:
throw TiffThrowHelper.InvalidColorType(colorType.ToString());

25
src/ImageSharp/Formats/Tiff/TiffDecoderCore.cs

@ -11,6 +11,7 @@ using SixLabors.ImageSharp.IO;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.Metadata;
using SixLabors.ImageSharp.Metadata.Profiles.Exif;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Formats.Tiff;
@ -280,6 +281,12 @@ internal class TiffDecoderCore : ImageDecoderCore
if (!this.skipMetadata)
{
imageFrameMetaData.ExifProfile = tags;
// We resolve the ICC profile early so that we can use it for color conversion if needed.
if (tags.TryGetValue(ExifTag.IccProfile, out IExifValue<byte[]> iccProfileBytes))
{
imageFrameMetaData.IccProfile = new IccProfile(iccProfileBytes.Value);
}
}
TiffFrameMetadata tiffMetadata = TiffFrameMetadata.Parse(tags);
@ -438,7 +445,7 @@ internal class TiffDecoderCore : ImageDecoderCore
stripBuffers[stripIndex] = this.memoryAllocator.Allocate<byte>(uncompressedStripSize);
}
using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(width, bitsPerPixel);
using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(width, bitsPerPixel, frame.Metadata);
TiffBasePlanarColorDecoder<TPixel> colorDecoder = this.CreatePlanarColorDecoder<TPixel>();
for (int i = 0; i < stripsPerPlane; i++)
@ -507,7 +514,7 @@ internal class TiffDecoderCore : ImageDecoderCore
Span<byte> stripBufferSpan = stripBuffer.GetSpan();
Buffer2D<TPixel> pixels = frame.PixelBuffer;
using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(width, bitsPerPixel);
using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(width, bitsPerPixel, frame.Metadata);
TiffBaseColorDecoder<TPixel> colorDecoder = this.CreateChunkyColorDecoder<TPixel>();
for (int stripIndex = 0; stripIndex < stripOffsets.Length; stripIndex++)
@ -578,7 +585,7 @@ internal class TiffDecoderCore : ImageDecoderCore
tilesBuffers[i] = this.memoryAllocator.Allocate<byte>(uncompressedTilesSize, AllocationOptions.Clean);
}
using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(frame.Width, bitsPerPixel);
using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(frame.Width, bitsPerPixel, frame.Metadata);
TiffBasePlanarColorDecoder<TPixel> colorDecoder = this.CreatePlanarColorDecoder<TPixel>();
int tileIndex = 0;
@ -679,7 +686,7 @@ internal class TiffDecoderCore : ImageDecoderCore
using IMemoryOwner<byte> tileBuffer = this.memoryAllocator.Allocate<byte>(bytesPerTileRow * tileLength, AllocationOptions.Clean);
Span<byte> tileBufferSpan = tileBuffer.GetSpan();
using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(frame.Width, bitsPerPixel, true, tileWidth, tileLength);
using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(frame.Width, bitsPerPixel, frame.Metadata, true, tileWidth, tileLength);
TiffBaseColorDecoder<TPixel> colorDecoder = this.CreateChunkyColorDecoder<TPixel>();
int tileIndex = 0;
@ -733,6 +740,7 @@ internal class TiffDecoderCore : ImageDecoderCore
this.ReferenceBlackAndWhite,
this.YcbcrCoefficients,
this.YcbcrSubSampling,
this.CompressionType,
this.byteOrder);
private TiffBasePlanarColorDecoder<TPixel> CreatePlanarColorDecoder<TPixel>()
@ -747,7 +755,13 @@ internal class TiffDecoderCore : ImageDecoderCore
this.YcbcrSubSampling,
this.byteOrder);
private TiffBaseDecompressor CreateDecompressor<TPixel>(int frameWidth, int bitsPerPixel, bool isTiled = false, int tileWidth = 0, int tileHeight = 0)
private TiffBaseDecompressor CreateDecompressor<TPixel>(
int frameWidth,
int bitsPerPixel,
ImageFrameMetadata metadata,
bool isTiled = false,
int tileWidth = 0,
int tileHeight = 0)
where TPixel : unmanaged, IPixel<TPixel> =>
TiffDecompressorsFactory.Create(
this.Options,
@ -756,6 +770,7 @@ internal class TiffDecoderCore : ImageDecoderCore
this.PhotometricInterpretation,
frameWidth,
bitsPerPixel,
metadata,
this.ColorType,
this.Predictor,
this.FaxCompressionOptions,

8
src/ImageSharp/Formats/Tiff/TiffDecoderMetadataCreator.cs

@ -5,7 +5,6 @@
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Metadata;
using SixLabors.ImageSharp.Metadata.Profiles.Exif;
using SixLabors.ImageSharp.Metadata.Profiles.Icc;
using SixLabors.ImageSharp.Metadata.Profiles.Iptc;
using SixLabors.ImageSharp.Metadata.Profiles.Xmp;
@ -29,6 +28,8 @@ internal static class TiffDecoderMetadataCreator
{
for (int i = 0; i < frames.Count; i++)
{
// ICC profile data has already been resolved in the frame metadata,
// as it is required for color conversion.
ImageFrameMetadata frameMetaData = frames[i];
if (TryGetIptc(frameMetaData.ExifProfile.Values, out byte[] iptcBytes))
{
@ -39,11 +40,6 @@ internal static class TiffDecoderMetadataCreator
{
frameMetaData.XmpProfile = new XmpProfile(xmpProfileBytes.Value);
}
if (frameMetaData.ExifProfile.TryGetValue(ExifTag.IccProfile, out IExifValue<byte[]> iccProfileBytes))
{
frameMetaData.IccProfile = new IccProfile(iccProfileBytes.Value);
}
}
}

7
src/ImageSharp/Formats/Webp/AlphaDecoder.cs

@ -6,7 +6,6 @@ using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Formats.Webp.BitReader;
@ -314,7 +313,7 @@ internal class AlphaDecoder : IDisposable
private static void HorizontalUnfilter(Span<byte> prev, Span<byte> input, Span<byte> dst, int width)
{
if ((Sse2.IsSupported || AdvSimd.IsSupported) && width >= 9)
if (Vector128.IsHardwareAccelerated && width >= 9)
{
dst[0] = (byte)(input[0] + (prev.IsEmpty ? 0 : prev[0]));
nuint i;
@ -362,7 +361,7 @@ internal class AlphaDecoder : IDisposable
{
HorizontalUnfilter(null, input, dst, width);
}
else if (Avx2.IsSupported)
else if (Vector256.IsHardwareAccelerated)
{
ref byte inputRef = ref MemoryMarshal.GetReference(input);
ref byte prevRef = ref MemoryMarshal.GetReference(prev);
@ -374,7 +373,7 @@ internal class AlphaDecoder : IDisposable
{
Vector256<int> a0 = Unsafe.As<byte, Vector256<int>>(ref Unsafe.Add(ref inputRef, i));
Vector256<int> b0 = Unsafe.As<byte, Vector256<int>>(ref Unsafe.Add(ref prevRef, i));
Vector256<byte> c0 = Avx2.Add(a0.AsByte(), b0.AsByte());
Vector256<byte> c0 = a0.AsByte() + b0.AsByte();
ref byte outputRef = ref Unsafe.Add(ref dstRef, i);
Unsafe.As<byte, Vector256<byte>>(ref outputRef) = c0;
}

145
src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs

@ -4,7 +4,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless;
@ -12,17 +12,20 @@ internal static class ColorSpaceTransformUtils
{
public static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
{
if (Avx2.IsSupported && tileWidth >= 16)
if (Vector256.IsHardwareAccelerated && tileWidth >= 16)
{
const int span = 16;
Span<ushort> values = stackalloc ushort[span];
var collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
var collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
var collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
var collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
var collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
// These shuffle masks are safe for use with Avx2.Shuffle because all indices are within their respective 128-bit lanes (0–15 for the low mask, 16–31 for the high mask),
// and all disabled lanes are set to 0xFF to zero those bytes per the vpshufb specification. This guarantees lane-local shuffling with no cross-lane violations.
Vector256<byte> collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
Vector256<byte> collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
Vector256<byte> collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
Vector256<byte> collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector256<byte> collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
Vector256<short> multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
for (int y = 0; y < tileHeight; y++)
{
Span<uint> srcSpan = bgra[(y * stride)..];
@ -33,18 +36,18 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector256<byte> r0 = Avx2.Shuffle(input0, collectColorBlueTransformsShuffleLowMask256);
Vector256<byte> r1 = Avx2.Shuffle(input1, collectColorBlueTransformsShuffleHighMask256);
Vector256<byte> r = Avx2.Or(r0, r1);
Vector256<byte> gb0 = Avx2.And(input0, collectColorBlueTransformsGreenBlueMask256);
Vector256<byte> gb1 = Avx2.And(input1, collectColorBlueTransformsGreenBlueMask256);
Vector256<ushort> gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector256<byte> g = Avx2.And(gb.AsByte(), collectColorBlueTransformsGreenMask256);
Vector256<short> a = Avx2.MultiplyHigh(r.AsInt16(), multsr);
Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg);
Vector256<byte> c = Avx2.Subtract(gb.AsByte(), b.AsByte());
Vector256<byte> d = Avx2.Subtract(c, a.AsByte());
Vector256<byte> e = Avx2.And(d, collectColorBlueTransformsBlueMask256);
Vector256<byte> r0 = Vector256_.ShufflePerLane(input0, collectColorBlueTransformsShuffleLowMask256);
Vector256<byte> r1 = Vector256_.ShufflePerLane(input1, collectColorBlueTransformsShuffleHighMask256);
Vector256<byte> r = r0 | r1;
Vector256<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask256;
Vector256<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask256;
Vector256<ushort> gb = Vector256_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector256<byte> g = gb.AsByte() & collectColorBlueTransformsGreenMask256;
Vector256<short> a = Vector256_.MultiplyHigh(r.AsInt16(), multsr);
Vector256<short> b = Vector256_.MultiplyHigh(g.AsInt16(), multsg);
Vector256<byte> c = gb.AsByte() - b.AsByte();
Vector256<byte> d = c - a.AsByte();
Vector256<byte> e = d & collectColorBlueTransformsBlueMask256;
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = e.AsUInt16();
@ -59,20 +62,20 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
}
}
else if (Sse41.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
const int span = 8;
Span<ushort> values = stackalloc ushort[span];
var collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
var collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
var collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
var collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
var collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
Vector128<byte> collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
Vector128<byte> collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
Vector128<byte> collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
Vector128<byte> collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector128<byte> collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
Vector128<short> multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
for (int y = 0; y < tileHeight; y++)
{
Span<uint> srcSpan = bgra[(y * stride)..];
@ -83,18 +86,18 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector128<byte> r0 = Ssse3.Shuffle(input0, collectColorBlueTransformsShuffleLowMask);
Vector128<byte> r1 = Ssse3.Shuffle(input1, collectColorBlueTransformsShuffleHighMask);
Vector128<byte> r = Sse2.Or(r0, r1);
Vector128<byte> gb0 = Sse2.And(input0, collectColorBlueTransformsGreenBlueMask);
Vector128<byte> gb1 = Sse2.And(input1, collectColorBlueTransformsGreenBlueMask);
Vector128<ushort> gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector128<byte> g = Sse2.And(gb.AsByte(), collectColorBlueTransformsGreenMask);
Vector128<short> a = Sse2.MultiplyHigh(r.AsInt16(), multsr);
Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg);
Vector128<byte> c = Sse2.Subtract(gb.AsByte(), b.AsByte());
Vector128<byte> d = Sse2.Subtract(c, a.AsByte());
Vector128<byte> e = Sse2.And(d, collectColorBlueTransformsBlueMask);
Vector128<byte> r0 = Vector128_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask);
Vector128<byte> r1 = Vector128_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask);
Vector128<byte> r = r0 | r1;
Vector128<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask;
Vector128<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask;
Vector128<ushort> gb = Vector128_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector128<byte> g = gb.AsByte() & collectColorBlueTransformsGreenMask;
Vector128<short> a = Vector128_.MultiplyHigh(r.AsInt16(), multsr);
Vector128<short> b = Vector128_.MultiplyHigh(g.AsInt16(), multsg);
Vector128<byte> c = gb.AsByte() - b.AsByte();
Vector128<byte> d = c - a.AsByte();
Vector128<byte> e = d & collectColorBlueTransformsBlueMask;
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = e.AsUInt16();
@ -109,16 +112,16 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
}
}
else
{
CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
CollectColorBlueTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
}
}
private static void CollectColorBlueTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
private static void CollectColorBlueTransformsScalar(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
{
int pos = 0;
while (tileHeight-- > 0)
@ -135,11 +138,11 @@ internal static class ColorSpaceTransformUtils
public static void CollectColorRedTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
{
if (Avx2.IsSupported && tileWidth >= 16)
if (Vector256.IsHardwareAccelerated && tileWidth >= 16)
{
Vector256<byte> collectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte();
Vector256<byte> collectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte();
var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
const int span = 16;
Span<ushort> values = stackalloc ushort[span];
for (int y = 0; y < tileHeight; y++)
@ -152,15 +155,15 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector256<byte> g0 = Avx2.And(input0, collectColorRedTransformsGreenMask256); // 0 0 | g 0
Vector256<byte> g1 = Avx2.And(input1, collectColorRedTransformsGreenMask256);
Vector256<ushort> g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector256<int> a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector256<int> a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16);
Vector256<ushort> a = Avx2.PackUnsignedSaturate(a0, a1); // x r
Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector256<byte> c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r'
Vector256<byte> d = Avx2.And(c, collectColorRedTransformsAndMask256); // 0 r'
Vector256<byte> g0 = input0 & collectColorRedTransformsGreenMask256; // 0 0 | g 0
Vector256<byte> g1 = input1 & collectColorRedTransformsGreenMask256;
Vector256<ushort> g = Vector256_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector256<int> a0 = Vector256.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector256<int> a1 = Vector256.ShiftRightLogical(input1.AsInt32(), 16);
Vector256<ushort> a = Vector256_.PackUnsignedSaturate(a0, a1); // x r
Vector256<short> b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector256<byte> c = a.AsByte() - b.AsByte(); // x r'
Vector256<byte> d = c & collectColorRedTransformsAndMask256; // 0 r'
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = d.AsUInt16();
@ -175,14 +178,14 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
}
}
else if (Sse41.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> collectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
Vector128<byte> collectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
const int span = 8;
Span<ushort> values = stackalloc ushort[span];
for (int y = 0; y < tileHeight; y++)
@ -195,15 +198,15 @@ internal static class ColorSpaceTransformUtils
nuint input1Idx = x + (span / 2);
Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector128<byte> g0 = Sse2.And(input0, collectColorRedTransformsGreenMask); // 0 0 | g 0
Vector128<byte> g1 = Sse2.And(input1, collectColorRedTransformsGreenMask);
Vector128<ushort> g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector128<int> a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector128<int> a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16);
Vector128<ushort> a = Sse41.PackUnsignedSaturate(a0, a1); // x r
Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector128<byte> c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r'
Vector128<byte> d = Sse2.And(c, collectColorRedTransformsAndMask); // 0 r'
Vector128<byte> g0 = input0 & collectColorRedTransformsGreenMask; // 0 0 | g 0
Vector128<byte> g1 = input1 & collectColorRedTransformsGreenMask;
Vector128<ushort> g = Vector128_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector128<int> a0 = Vector128.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector128<int> a1 = Vector128.ShiftRightLogical(input1.AsInt32(), 16);
Vector128<ushort> a = Vector128_.PackUnsignedSaturate(a0, a1); // x r
Vector128<short> b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector128<byte> c = a.AsByte() - b.AsByte(); // x r'
Vector128<byte> d = c & collectColorRedTransformsAndMask; // 0 r'
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = d.AsUInt16();
@ -218,16 +221,16 @@ internal static class ColorSpaceTransformUtils
int leftOver = tileWidth & (span - 1);
if (leftOver > 0)
{
CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
}
}
else
{
CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
CollectColorRedTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
}
}
private static void CollectColorRedTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
private static void CollectColorRedTransformsScalar(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
{
int pos = 0;
while (tileHeight-- > 0)

196
src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs

@ -6,6 +6,7 @@ using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Memory;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless;
@ -94,17 +95,20 @@ internal static unsafe class LosslessUtils
/// <param name="pixelData">The pixel data to apply the transformation.</param>
public static void AddGreenToBlueAndRed(Span<uint> pixelData)
{
if (Avx2.IsSupported && pixelData.Length >= 8)
if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
{
Vector256<byte> addGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
// The `255` values disable the write for alpha (A), since 0x80 is set in the control byte (high bit set).
// Each byte index is within its respective 128-bit lane (0–15 and 16–31), so this is safe for per-lane shuffle.
// The high bits are not set for the index bytes, and the values are always < 16 per lane, satisfying AVX2 lane rules.
Vector256<byte> addGreenToBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
Vector256<byte> in0g0g = Avx2.Shuffle(input, addGreenToBlueAndRedMaskAvx2);
Vector256<byte> output = Avx2.Add(input, in0g0g);
Vector256<byte> in0g0g = Vector256_.ShufflePerLane(input, addGreenToBlueAndRedMask);
Vector256<byte> output = input + in0g0g;
Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
i += 8;
}
@ -115,39 +119,17 @@ internal static unsafe class LosslessUtils
AddGreenToBlueAndRedScalar(pixelData[(int)i..]);
}
}
else if (Ssse3.IsSupported && pixelData.Length >= 4)
{
Vector128<byte> addGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<byte> in0g0g = Ssse3.Shuffle(input, addGreenToBlueAndRedMaskSsse3);
Vector128<byte> output = Sse2.Add(input, in0g0g);
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
i += 4;
}
while (i <= numPixels - 4);
if (i != numPixels)
{
AddGreenToBlueAndRedScalar(pixelData[(int)i..]);
}
}
else if (Sse2.IsSupported && pixelData.Length >= 4)
else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
{
Vector128<byte> addGreenToBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
Vector128<ushort> b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200);
Vector128<ushort> c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g
Vector128<byte> output = Sse2.Add(input.AsByte(), c.AsByte());
Vector128<byte> in0g0g = Vector128_.ShuffleNative(input, addGreenToBlueAndRedMask);
Vector128<byte> output = input + in0g0g;
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
i += 4;
}
@ -180,17 +162,17 @@ internal static unsafe class LosslessUtils
public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
{
if (Avx2.IsSupported && pixelData.Length >= 8)
if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
{
Vector256<byte> subtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
Vector256<byte> subtractGreenFromBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
Vector256<byte> in0g0g = Avx2.Shuffle(input, subtractGreenFromBlueAndRedMaskAvx2);
Vector256<byte> output = Avx2.Subtract(input, in0g0g);
Vector256<byte> in0g0g = Vector256_.ShufflePerLane(input, subtractGreenFromBlueAndRedMask);
Vector256<byte> output = input - in0g0g;
Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
i += 8;
}
@ -201,39 +183,17 @@ internal static unsafe class LosslessUtils
SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]);
}
}
else if (Ssse3.IsSupported && pixelData.Length >= 4)
{
Vector128<byte> subtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<byte> in0g0g = Ssse3.Shuffle(input, subtractGreenFromBlueAndRedMaskSsse3);
Vector128<byte> output = Sse2.Subtract(input, in0g0g);
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
i += 4;
}
while (i <= numPixels - 4);
if (i != numPixels)
{
SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]);
}
}
else if (Sse2.IsSupported && pixelData.Length >= 4)
else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
{
Vector128<byte> subtractGreenFromBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
nuint numPixels = (uint)pixelData.Length;
nuint i = 0;
do
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
Vector128<ushort> b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200);
Vector128<ushort> c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g
Vector128<byte> output = Sse2.Subtract(input.AsByte(), c.AsByte());
Vector128<byte> in0g0g = Vector128_.ShuffleNative(input, subtractGreenFromBlueAndRedMask);
Vector128<byte> output = input - in0g0g;
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
i += 4;
}
@ -412,7 +372,7 @@ internal static unsafe class LosslessUtils
TransformColorScalar(m, pixelData[(int)idx..], numPixels - (int)idx);
}
}
else if (Sse2.IsSupported && numPixels >= 4)
else if (Vector128.IsHardwareAccelerated && numPixels >= 4)
{
Vector128<byte> transformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector128<byte> transformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
@ -423,16 +383,16 @@ internal static unsafe class LosslessUtils
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
Vector128<byte> a = Sse2.And(input.AsByte(), transformColorAlphaGreenMask);
Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector128<short> e = Sse2.ShiftLeftLogical(input.AsInt16(), 8);
Vector128<short> f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
Vector128<int> g = Sse2.ShiftRightLogical(f.AsInt32(), 16);
Vector128<byte> h = Sse2.Add(g.AsByte(), d.AsByte());
Vector128<byte> i = Sse2.And(h, transformColorRedBlueMask);
Vector128<byte> output = Sse2.Subtract(input.AsByte(), i);
Vector128<byte> a = input.AsByte() & transformColorAlphaGreenMask;
Vector128<short> b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector128<short> e = Vector128_.ShiftLeftLogical(input.AsInt16(), 8);
Vector128<short> f = Vector128_.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
Vector128<int> g = Vector128.ShiftRightLogical(f.AsInt32(), 16);
Vector128<byte> h = g.AsByte() + d.AsByte();
Vector128<byte> i = h & transformColorRedBlueMask;
Vector128<byte> output = input.AsByte() - i;
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
idx += 4;
}
@ -503,7 +463,7 @@ internal static unsafe class LosslessUtils
TransformColorInverseScalar(m, pixelData[(int)idx..]);
}
}
else if (Sse2.IsSupported && pixelData.Length >= 4)
else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
{
Vector128<byte> transformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
@ -514,17 +474,17 @@ internal static unsafe class LosslessUtils
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
Vector128<byte> a = Sse2.And(input.AsByte(), transformColorInverseAlphaGreenMask);
Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector128<byte> e = Sse2.Add(input.AsByte(), d.AsByte());
Vector128<short> f = Sse2.ShiftLeftLogical(e.AsInt16(), 8);
Vector128<short> g = Sse2.MultiplyHigh(f, multsb2.AsInt16());
Vector128<int> h = Sse2.ShiftRightLogical(g.AsInt32(), 8);
Vector128<byte> i = Sse2.Add(h.AsByte(), f.AsByte());
Vector128<short> j = Sse2.ShiftRightLogical(i.AsInt16(), 8);
Vector128<byte> output = Sse2.Or(j.AsByte(), a);
Vector128<byte> a = input.AsByte() & transformColorInverseAlphaGreenMask;
Vector128<short> b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
Vector128<short> d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector128<byte> e = input.AsByte() + d.AsByte();
Vector128<short> f = Vector128_.ShiftLeftLogical(e.AsInt16(), 8);
Vector128<short> g = Vector128_.MultiplyHigh(f, multsb2.AsInt16());
Vector128<int> h = Vector128.ShiftRightLogical(g.AsInt32(), 8);
Vector128<byte> i = h.AsByte() + f.AsByte();
Vector128<short> j = Vector128.ShiftRightLogical(i.AsInt16(), 8);
Vector128<byte> output = j.AsByte() | a;
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
}
@ -1401,15 +1361,15 @@ internal static unsafe class LosslessUtils
private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
Vector128<short> v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16());
Vector128<short> v2 = Sse2.Subtract(v1, c2Vec.AsInt16());
Vector128<byte> b = Sse2.PackUnsignedSaturate(v2, v2);
return Sse2.ConvertToUInt32(b.AsUInt32());
Vector128<byte> c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c2Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128<byte>.Zero);
Vector128<short> v1 = c0Vec.AsInt16() + c1Vec.AsInt16();
Vector128<short> v2 = v1 - c2Vec.AsInt16();
Vector128<byte> b = Vector128_.PackUnsignedSaturate(v2, v2);
return b.AsUInt32().ToScalar();
}
{
@ -1432,20 +1392,20 @@ internal static unsafe class LosslessUtils
private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
Vector128<byte> b0 = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
Vector128<short> avg = Sse2.Add(c1Vec.AsInt16(), c0Vec.AsInt16());
Vector128<short> a0 = Sse2.ShiftRightLogical(avg, 1);
Vector128<short> a1 = Sse2.Subtract(a0, b0.AsInt16());
Vector128<short> bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16());
Vector128<short> a2 = Sse2.Subtract(a1, bgta);
Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2, 1);
Vector128<short> a4 = Sse2.Add(a0, a3).AsInt16();
Vector128<byte> a5 = Sse2.PackUnsignedSaturate(a4, a4);
return Sse2.ConvertToUInt32(a5.AsUInt32());
Vector128<byte> c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128<byte>.Zero);
Vector128<byte> c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128<byte>.Zero);
Vector128<byte> b0 = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128<byte>.Zero);
Vector128<short> avg = c1Vec.AsInt16() + c0Vec.AsInt16();
Vector128<short> a0 = Vector128.ShiftRightLogical(avg, 1);
Vector128<short> a1 = a0 - b0.AsInt16();
Vector128<short> bgta = Vector128.GreaterThan(b0.AsInt16(), a0.AsInt16());
Vector128<short> a2 = a1 - bgta;
Vector128<short> a3 = Vector128.ShiftRightArithmetic(a2, 1);
Vector128<short> a4 = (a0 + a3).AsInt16();
Vector128<byte> a5 = Vector128_.PackUnsignedSaturate(a4, a4);
return a5.AsUInt32().ToScalar();
}
{
@ -1475,23 +1435,23 @@ internal static unsafe class LosslessUtils
private static uint Select(uint a, uint b, uint c, Span<short> scratch)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
fixed (short* ptr = &MemoryMarshal.GetReference(scratch))
{
Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte();
Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte();
Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0);
Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0);
Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0);
Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
Vector128<byte> ac = Sse2.Or(ac0, ca0);
Vector128<byte> bc = Sse2.Or(bc0, cb0);
Vector128<byte> pa = Sse2.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
Sse2.Store((ushort*)ptr, diff);
Vector128<byte> a0 = Vector128.CreateScalar(a).AsByte();
Vector128<byte> b0 = Vector128.CreateScalar(b).AsByte();
Vector128<byte> c0 = Vector128.CreateScalar(c).AsByte();
Vector128<byte> ac0 = Vector128_.SubtractSaturate(a0, c0);
Vector128<byte> ca0 = Vector128_.SubtractSaturate(c0, a0);
Vector128<byte> bc0 = Vector128_.SubtractSaturate(b0, c0);
Vector128<byte> cb0 = Vector128_.SubtractSaturate(c0, b0);
Vector128<byte> ac = ac0 | ca0;
Vector128<byte> bc = bc0 | cb0;
Vector128<byte> pa = Vector128_.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
Vector128<byte> pb = Vector128_.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
Vector128<ushort> diff = pb.AsUInt16() - pa.AsUInt16();
diff.Store((ushort*)ptr);
int paMinusPb = ptr[3] + ptr[2] + ptr[1] + ptr[0];
return (paMinusPb <= 0) ? a : b;
}

1273
src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

File diff suppressed because it is too large

355
src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs

@ -2,10 +2,11 @@
// Licensed under the Six Labors Split License.
using System.Buffers.Binary;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Webp.Lossy;
@ -78,7 +79,7 @@ internal static unsafe class Vp8Encoding
// Does two inverse transforms.
public static void ITransformTwo(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
@ -116,10 +117,10 @@ internal static unsafe class Vp8Encoding
Vector128<long> inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
Vector128<long> inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);
in0 = Sse2.UnpackLow(in0, inb0);
in1 = Sse2.UnpackLow(in1, inb1);
in2 = Sse2.UnpackLow(in2, inb2);
in3 = Sse2.UnpackLow(in3, inb3);
in0 = Vector128_.UnpackLow(in0, inb0);
in1 = Vector128_.UnpackLow(in1, inb1);
in2 = Vector128_.UnpackLow(in2, inb2);
in3 = Vector128_.UnpackLow(in3, inb3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
@ -128,49 +129,45 @@ internal static unsafe class Vp8Encoding
// Vertical pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
// Horizontal pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
// Add inverse transform to 'ref' and store.
// Load the reference(s).
Vector128<byte> ref0 = Vector128<byte>.Zero;
Vector128<byte> ref1 = Vector128<byte>.Zero;
Vector128<byte> ref2 = Vector128<byte>.Zero;
Vector128<byte> ref3 = Vector128<byte>.Zero;
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
// Load eight bytes/pixels per line.
ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
Vector128<byte> ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
Vector128<byte> ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
Vector128<byte> ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
Vector128<byte> ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
// Convert to 16b.
ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
ref0 = Vector128_.UnpackLow(ref0, Vector128<byte>.Zero);
ref1 = Vector128_.UnpackLow(ref1, Vector128<byte>.Zero);
ref2 = Vector128_.UnpackLow(ref2, Vector128<byte>.Zero);
ref3 = Vector128_.UnpackLow(ref3, Vector128<byte>.Zero);
// Add the inverse transform(s).
Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
Vector128<short> ref0InvAdded = ref0.AsInt16() + t0.AsInt16();
Vector128<short> ref1InvAdded = ref1.AsInt16() + t1.AsInt16();
Vector128<short> ref2InvAdded = ref2.AsInt16() + t2.AsInt16();
Vector128<short> ref3InvAdded = ref3.AsInt16() + t3.AsInt16();
// Unsigned saturate to 8b.
ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
// Store eight bytes/pixels per line.
ref byte outputRef = ref MemoryMarshal.GetReference(dst);
@ -188,7 +185,7 @@ internal static unsafe class Vp8Encoding
public static void ITransformOne(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
// Load and concatenate the transform coefficients (we'll do two inverse
// transforms in parallel). In the case of only one inverse transform, the
@ -207,63 +204,59 @@ internal static unsafe class Vp8Encoding
// Vertical pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
// Horizontal pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
// Add inverse transform to 'ref' and store.
// Load the reference(s).
Vector128<byte> ref0 = Vector128<byte>.Zero;
Vector128<byte> ref1 = Vector128<byte>.Zero;
Vector128<byte> ref2 = Vector128<byte>.Zero;
Vector128<byte> ref3 = Vector128<byte>.Zero;
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
// Load four bytes/pixels per line.
ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
Vector128<byte> ref0 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref referenceRef)).AsByte();
Vector128<byte> ref1 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
Vector128<byte> ref2 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
Vector128<byte> ref3 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
// Convert to 16b.
ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
ref0 = Vector128_.UnpackLow(ref0, Vector128<byte>.Zero);
ref1 = Vector128_.UnpackLow(ref1, Vector128<byte>.Zero);
ref2 = Vector128_.UnpackLow(ref2, Vector128<byte>.Zero);
ref3 = Vector128_.UnpackLow(ref3, Vector128<byte>.Zero);
// Add the inverse transform(s).
Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
Vector128<short> ref0InvAdded = ref0.AsInt16() + t0.AsInt16();
Vector128<short> ref1InvAdded = ref1.AsInt16() + t1.AsInt16();
Vector128<short> ref2InvAdded = ref2.AsInt16() + t2.AsInt16();
Vector128<short> ref3InvAdded = ref3.AsInt16() + t3.AsInt16();
// Unsigned saturate to 8b.
ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
// Unsigned saturate to 8b.
ref byte outputRef = ref MemoryMarshal.GetReference(dst);
// Store four bytes/pixels per line.
int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
Unsafe.As<byte, int>(ref outputRef) = output0;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
int output0 = ref0.AsInt32().ToScalar();
int output1 = ref1.AsInt32().ToScalar();
int output2 = ref2.AsInt32().ToScalar();
int output3 = ref3.AsInt32().ToScalar();
Unsafe.WriteUnaligned(ref outputRef, output0);
Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps), output1);
Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2), output2);
Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3), output3);
}
else
{
@ -302,72 +295,72 @@ internal static unsafe class Vp8Encoding
}
}
private static void InverseTransformVerticalPass(Vector128<long> in0, Vector128<long> in2, Vector128<long> in1, Vector128<long> in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3)
private static void InverseTransformVerticalPassVector128(Vector128<long> in0, Vector128<long> in2, Vector128<long> in1, Vector128<long> in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3)
{
Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
Vector128<short> a = in0.AsInt16() + in2.AsInt16();
Vector128<short> b = in0.AsInt16() - in2.AsInt16();
Vector128<short> k1 = Vector128.Create((short)20091).AsInt16();
Vector128<short> k2 = Vector128.Create((short)-30068).AsInt16();
// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2);
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1);
Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
Vector128<short> c4 = Sse2.Subtract(c1, c2);
Vector128<short> c = Sse2.Add(c3, c4);
Vector128<short> c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2);
Vector128<short> c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1);
Vector128<short> c3 = in1.AsInt16() - in3.AsInt16();
Vector128<short> c4 = c1 - c2;
Vector128<short> c = c3 + c4;
// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1);
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2);
Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
Vector128<short> d4 = Sse2.Add(d1, d2);
Vector128<short> d = Sse2.Add(d3, d4);
Vector128<short> d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1);
Vector128<short> d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2);
Vector128<short> d3 = in1.AsInt16() + in3.AsInt16();
Vector128<short> d4 = d1 + d2;
Vector128<short> d = d3 + d4;
// Second pass.
tmp0 = Sse2.Add(a, d);
tmp1 = Sse2.Add(b, c);
tmp2 = Sse2.Subtract(b, c);
tmp3 = Sse2.Subtract(a, d);
tmp0 = a + d;
tmp1 = b + c;
tmp2 = b - c;
tmp3 = a - d;
}
private static void InverseTransformHorizontalPass(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
private static void InverseTransformHorizontalPassVector128(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
{
Vector128<short> dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4));
Vector128<short> a = Sse2.Add(dc, t2.AsInt16());
Vector128<short> b = Sse2.Subtract(dc, t2.AsInt16());
Vector128<short> dc = t0.AsInt16() + Vector128.Create((short)4);
Vector128<short> a = dc + t2.AsInt16();
Vector128<short> b = dc - t2.AsInt16();
Vector128<short> k1 = Vector128.Create((short)20091).AsInt16();
Vector128<short> k2 = Vector128.Create((short)-30068).AsInt16();
// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
Vector128<short> c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2);
Vector128<short> c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1);
Vector128<short> c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
Vector128<short> c4 = Sse2.Subtract(c1, c2);
Vector128<short> c = Sse2.Add(c3, c4);
Vector128<short> c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2);
Vector128<short> c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1);
Vector128<short> c3 = t1.AsInt16() - t3.AsInt16();
Vector128<short> c4 = c1 - c2;
Vector128<short> c = c3 + c4;
// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
Vector128<short> d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1);
Vector128<short> d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2);
Vector128<short> d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
Vector128<short> d4 = Sse2.Add(d1, d2);
Vector128<short> d = Sse2.Add(d3, d4);
Vector128<short> d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1);
Vector128<short> d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2);
Vector128<short> d3 = t1.AsInt16() + t3.AsInt16();
Vector128<short> d4 = d1 + d2;
Vector128<short> d = d3 + d4;
// Second pass.
Vector128<short> tmp0 = Sse2.Add(a, d);
Vector128<short> tmp1 = Sse2.Add(b, c);
Vector128<short> tmp2 = Sse2.Subtract(b, c);
Vector128<short> tmp3 = Sse2.Subtract(a, d);
shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
Vector128<short> tmp0 = a + d;
Vector128<short> tmp1 = b + c;
Vector128<short> tmp2 = b - c;
Vector128<short> tmp3 = a - d;
shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3);
shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3);
shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3);
shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3);
}
public static void FTransform2(Span<byte> src, Span<byte> reference, Span<short> output, Span<short> output2, Span<int> scratch)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
ref byte srcRef = ref MemoryMarshal.GetReference(src);
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
@ -385,38 +378,38 @@ internal static unsafe class Vp8Encoding
Vector128<long> ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0);
// Convert both to 16 bit.
Vector128<byte> srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow0 = Vector128_.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow1 = Vector128_.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow2 = Vector128_.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
Vector128<byte> srcLow3 = Vector128_.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow0 = Vector128_.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow1 = Vector128_.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow2 = Vector128_.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
Vector128<byte> refLow3 = Vector128_.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);
// Compute difference. -> 00 01 02 03 00' 01' 02' 03'
Vector128<short> diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16());
Vector128<short> diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16());
Vector128<short> diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16());
Vector128<short> diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16());
Vector128<short> diff0 = srcLow0.AsInt16() - refLow0.AsInt16();
Vector128<short> diff1 = srcLow1.AsInt16() - refLow1.AsInt16();
Vector128<short> diff2 = srcLow2.AsInt16() - refLow2.AsInt16();
Vector128<short> diff3 = srcLow3.AsInt16() - refLow3.AsInt16();
// Unpack and shuffle.
// 00 01 02 03 0 0 0 0
// 10 11 12 13 0 0 0 0
// 20 21 22 23 0 0 0 0
// 30 31 32 33 0 0 0 0
Vector128<int> shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
Vector128<int> shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
Vector128<int> shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
Vector128<int> shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());
Vector128<int> shuf01l = Vector128_.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
Vector128<int> shuf23l = Vector128_.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
Vector128<int> shuf01h = Vector128_.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
Vector128<int> shuf23h = Vector128_.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());
// First pass.
FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);
FTransformPass1Vector128(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
FTransformPass1Vector128(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);
// Second pass.
FTransformPass2SSE2(v01l, v32l, output);
FTransformPass2SSE2(v01h, v32h, output2);
FTransformPass2Vector128(v01l, v32l, output);
FTransformPass2Vector128(v01h, v32h, output2);
}
else
{
@ -427,7 +420,7 @@ internal static unsafe class Vp8Encoding
public static void FTransform(Span<byte> src, Span<byte> reference, Span<short> output, Span<int> scratch)
{
if (Sse2.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
ref byte srcRef = ref MemoryMarshal.GetReference(src);
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
@ -449,29 +442,29 @@ internal static unsafe class Vp8Encoding
// 20 21 22 23 *
// 30 31 32 33 *
// Shuffle.
Vector128<short> srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16());
Vector128<short> srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16());
Vector128<short> refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
Vector128<short> refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16());
Vector128<short> srcLow0 = Vector128_.UnpackLow(src0.AsInt16(), src1.AsInt16());
Vector128<short> srcLow1 = Vector128_.UnpackLow(src2.AsInt16(), src3.AsInt16());
Vector128<short> refLow0 = Vector128_.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
Vector128<short> refLow1 = Vector128_.UnpackLow(ref2.AsInt16(), ref3.AsInt16());
// 00 01 10 11 02 03 12 13 * * ...
// 20 21 30 31 22 22 32 33 * * ...
// Convert both to 16 bit.
Vector128<byte> src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> src0_16b = Vector128_.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> src1_16b = Vector128_.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
Vector128<byte> ref0_16b = Vector128_.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
Vector128<byte> ref1_16b = Vector128_.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);
// Compute the difference.
Vector128<short> row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16());
Vector128<short> row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16());
Vector128<short> row01 = src0_16b.AsInt16() - ref0_16b.AsInt16();
Vector128<short> row23 = src1_16b.AsInt16() - ref1_16b.AsInt16();
// First pass.
FTransformPass1SSE2(row01, row23, out Vector128<int> v01, out Vector128<int> v32);
FTransformPass1Vector128(row01, row23, out Vector128<int> v01, out Vector128<int> v32);
// Second pass.
FTransformPass2SSE2(v01, v32, output);
FTransformPass2Vector128(v01, v32, output);
}
else
{
@ -517,88 +510,88 @@ internal static unsafe class Vp8Encoding
}
}
public static void FTransformPass1SSE2(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32)
public static void FTransformPass1Vector128(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32)
{
// *in01 = 00 01 10 11 02 03 12 13
// *in23 = 20 21 30 31 22 23 32 33
Vector128<short> shuf01_p = Sse2.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301);
Vector128<short> shuf32_p = Sse2.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301);
Vector128<short> shuf01_p = Vector128_.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301);
Vector128<short> shuf32_p = Vector128_.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301);
// 00 01 10 11 03 02 13 12
// 20 21 30 31 23 22 33 32
Vector128<long> s01 = Sse2.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64());
Vector128<long> s32 = Sse2.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64());
Vector128<long> s01 = Vector128_.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64());
Vector128<long> s32 = Vector128_.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64());
// 00 01 10 11 20 21 30 31
// 03 02 13 12 23 22 33 32
Vector128<short> a01 = Sse2.Add(s01.AsInt16(), s32.AsInt16());
Vector128<short> a32 = Sse2.Subtract(s01.AsInt16(), s32.AsInt16());
Vector128<short> a01 = s01.AsInt16() + s32.AsInt16();
Vector128<short> a32 = s01.AsInt16() - s32.AsInt16();
// [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
// [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
// [ (a0 + a1) << 3, ... ]
Vector128<int> tmp0 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p
Vector128<int> tmp0 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p
// [ (a0 - a1) << 3, ... ]
Vector128<int> tmp2 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16()); // K88m
Vector128<int> tmp11 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16()); // K5352_2217p
Vector128<int> tmp31 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16()); // K5352_2217m
Vector128<int> tmp12 = Sse2.Add(tmp11, Vector128.Create(1812));
Vector128<int> tmp32 = Sse2.Add(tmp31, Vector128.Create(937));
Vector128<int> tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9);
Vector128<int> tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9);
Vector128<short> s03 = Sse2.PackSignedSaturate(tmp0, tmp2);
Vector128<short> s12 = Sse2.PackSignedSaturate(tmp1, tmp3);
Vector128<short> slo = Sse2.UnpackLow(s03, s12); // 0 1 0 1 0 1...
Vector128<short> shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3
Vector128<int> v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32());
out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32());
out32 = Sse2.Shuffle(v23, SimdUtils.Shuffle.MMShuffle1032);
Vector128<int> tmp2 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16()); // K88m
Vector128<int> tmp11 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16()); // K5352_2217p
Vector128<int> tmp31 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16()); // K5352_2217m
Vector128<int> tmp12 = tmp11 + Vector128.Create(1812);
Vector128<int> tmp32 = tmp31 + Vector128.Create(937);
Vector128<int> tmp1 = Vector128.ShiftRightArithmetic(tmp12, 9);
Vector128<int> tmp3 = Vector128.ShiftRightArithmetic(tmp32, 9);
Vector128<short> s03 = Vector128_.PackSignedSaturate(tmp0, tmp2);
Vector128<short> s12 = Vector128_.PackSignedSaturate(tmp1, tmp3);
Vector128<short> slo = Vector128_.UnpackLow(s03, s12); // 0 1 0 1 0 1...
Vector128<short> shi = Vector128_.UnpackHigh(s03, s12); // 2 3 2 3 2 3
Vector128<int> v23 = Vector128_.UnpackHigh(slo.AsInt32(), shi.AsInt32());
out01 = Vector128_.UnpackLow(slo.AsInt32(), shi.AsInt32());
out32 = Vector128_.ShuffleNative(v23, SimdUtils.Shuffle.MMShuffle1032);
}
public static void FTransformPass2SSE2(Vector128<int> v01, Vector128<int> v32, Span<short> output)
public static void FTransformPass2Vector128(Vector128<int> v01, Vector128<int> v32, Span<short> output)
{
// Same operations are done on the (0,3) and (1,2) pairs.
// a3 = v0 - v3
// a2 = v1 - v2
Vector128<short> a32 = Sse2.Subtract(v01.AsInt16(), v32.AsInt16());
Vector128<long> a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64());
Vector128<short> a32 = v01.AsInt16() - v32.AsInt16();
Vector128<long> a22 = Vector128_.UnpackHigh(a32.AsInt64(), a32.AsInt64());
Vector128<short> b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16());
Vector128<int> c1 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16()); // K5352_2217
Vector128<int> c3 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16()); // K2217_5352
Vector128<int> d1 = Sse2.Add(c1, Vector128.Create(12000 + (1 << 16))); // K12000PlusOne
Vector128<int> d3 = Sse2.Add(c3, Vector128.Create(51000));
Vector128<int> e1 = Sse2.ShiftRightArithmetic(d1, 16);
Vector128<int> e3 = Sse2.ShiftRightArithmetic(d3, 16);
Vector128<short> b23 = Vector128_.UnpackLow(a22.AsInt16(), a32.AsInt16());
Vector128<int> c1 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16()); // K5352_2217
Vector128<int> c3 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16()); // K2217_5352
Vector128<int> d1 = c1 + Vector128.Create(12000 + (1 << 16)); // K12000PlusOne
Vector128<int> d3 = c3 + Vector128.Create(51000);
Vector128<int> e1 = Vector128.ShiftRightArithmetic(d1, 16);
Vector128<int> e3 = Vector128.ShiftRightArithmetic(d3, 16);
// f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
// f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
Vector128<short> f1 = Sse2.PackSignedSaturate(e1, e1);
Vector128<short> f3 = Sse2.PackSignedSaturate(e3, e3);
Vector128<short> f1 = Vector128_.PackSignedSaturate(e1, e1);
Vector128<short> f3 = Vector128_.PackSignedSaturate(e3, e3);
// g1 = f1 + (a3 != 0);
// The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
// desired (0, 1), we add one earlier through k12000_plus_one.
// -> g1 = f1 + 1 - (a3 == 0)
Vector128<short> g1 = Sse2.Add(f1, Sse2.CompareEqual(a32, Vector128<short>.Zero));
Vector128<short> g1 = f1 + Vector128.Equals(a32, Vector128<short>.Zero);
// a0 = v0 + v3
// a1 = v1 + v2
Vector128<short> a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16());
Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), Vector128.Create((short)7));
Vector128<short> a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
Vector128<short> c0 = Sse2.Add(a01Plus7, a11);
Vector128<short> c2 = Sse2.Subtract(a01Plus7, a11);
Vector128<short> a01 = v01.AsInt16() + v32.AsInt16();
Vector128<short> a01Plus7 = a01.AsInt16() + Vector128.Create((short)7);
Vector128<short> a11 = Vector128_.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
Vector128<short> c0 = a01Plus7 + a11;
Vector128<short> c2 = a01Plus7 - a11;
// d0 = (a0 + a1 + 7) >> 4;
// d2 = (a0 - a1 + 7) >> 4;
Vector128<short> d0 = Sse2.ShiftRightArithmetic(c0, 4);
Vector128<short> d2 = Sse2.ShiftRightArithmetic(c2, 4);
Vector128<short> d0 = Vector128.ShiftRightArithmetic(c0, 4);
Vector128<short> d2 = Vector128.ShiftRightArithmetic(c2, 4);
Vector128<long> d0g1 = Sse2.UnpackLow(d0.AsInt64(), g1.AsInt64());
Vector128<long> d2f3 = Sse2.UnpackLow(d2.AsInt64(), f3.AsInt64());
Vector128<long> d0g1 = Vector128_.UnpackLow(d0.AsInt64(), g1.AsInt64());
Vector128<long> d2f3 = Vector128_.UnpackLow(d2.AsInt64(), f3.AsInt64());
ref short outputRef = ref MemoryMarshal.GetReference(output);
Unsafe.As<short, Vector128<short>>(ref outputRef) = d0g1.AsInt16();

209
src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs

@ -5,7 +5,7 @@ using System.Buffers;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
@ -29,9 +29,9 @@ internal static class YuvConversion
// ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16
public static void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
{
if (Sse41.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
UpSampleVector128(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
}
else
{
@ -107,7 +107,7 @@ internal static class YuvConversion
//
// Then m can be written as
// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
private static void UpSampleSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
private static void UpSampleVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
{
const int xStep = 3;
Array.Clear(uvBuffer);
@ -138,18 +138,18 @@ internal static class YuvConversion
{
for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
{
UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
ConvertYuvToBgrWithBottomYVector128(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
}
}
else
{
for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
{
UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep);
UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
ConvertYuvToBgrVector128(topY, topDst, ru, rv, pos, xStep);
}
}
@ -161,18 +161,18 @@ internal static class YuvConversion
Span<byte> tmpBottomDst = tmpTopDst[(4 * 32)..];
Span<byte> tmpTop = tmpBottomDst[(4 * 32)..];
Span<byte> tmpBottom = bottomY.IsEmpty ? null : tmpTop[32..];
UpSampleLastBlock(topU[uvPos..], curU[uvPos..], leftOver, ru);
UpSampleLastBlock(topV[uvPos..], curV[uvPos..], leftOver, rv);
UpSampleLastBlockVector128(topU[uvPos..], curU[uvPos..], leftOver, ru);
UpSampleLastBlockVector128(topV[uvPos..], curV[uvPos..], leftOver, rv);
topY[pos..len].CopyTo(tmpTop);
if (!bottomY.IsEmpty)
{
bottomY[pos..len].CopyTo(tmpBottom);
ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
ConvertYuvToBgrWithBottomYVector128(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
}
else
{
ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep);
ConvertYuvToBgrVector128(tmpTop, tmpTopDst, ru, rv, 0, xStep);
}
tmpTopDst[..((len - pos) * xStep)].CopyTo(topDst[(pos * xStep)..]);
@ -184,7 +184,7 @@ internal static class YuvConversion
}
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
private static void UpSample32Pixels(ref byte r1, ref byte r2, Span<byte> output)
private static void UpSample32PixelsVector128(ref byte r1, ref byte r2, Span<byte> output)
{
// Load inputs.
Vector128<byte> a = Unsafe.As<byte, Vector128<byte>>(ref r1);
@ -192,28 +192,28 @@ internal static class YuvConversion
Vector128<byte> c = Unsafe.As<byte, Vector128<byte>>(ref r2);
Vector128<byte> d = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref r2, 1));
Vector128<byte> s = Sse2.Average(a, d); // s = (a + d + 1) / 2
Vector128<byte> t = Sse2.Average(b, c); // t = (b + c + 1) / 2
Vector128<byte> st = Sse2.Xor(s, t); // st = s^t
Vector128<byte> s = Vector128_.Average(a, d); // s = (a + d + 1) / 2
Vector128<byte> t = Vector128_.Average(b, c); // t = (b + c + 1) / 2
Vector128<byte> st = s ^ t; // st = s^t
Vector128<byte> ad = Sse2.Xor(a, d); // ad = a^d
Vector128<byte> bc = Sse2.Xor(b, c); // bc = b^c
Vector128<byte> ad = a ^ d; // ad = a^d
Vector128<byte> bc = b ^ c; // bc = b^c
Vector128<byte> t1 = Sse2.Or(ad, bc); // (a^d) | (b^c)
Vector128<byte> t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t)
Vector128<byte> t3 = Sse2.And(t2, Vector128.Create((byte)1)); // (a^d) | (b^c) | (s^t) & 1
Vector128<byte> t4 = Sse2.Average(s, t);
Vector128<byte> k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4
Vector128<byte> t1 = ad | bc; // (a^d) | (b^c)
Vector128<byte> t2 = t1 | st; // (a^d) | (b^c) | (s^t)
Vector128<byte> t3 = t2 & Vector128.Create((byte)1); // (a^d) | (b^c) | (s^t) & 1
Vector128<byte> t4 = Vector128_.Average(s, t);
Vector128<byte> k = t4 - t3; // k = (a + b + c + d) / 4
Vector128<byte> diag1 = GetM(k, st, bc, t);
Vector128<byte> diag2 = GetM(k, st, ad, s);
Vector128<byte> diag1 = GetMVector128(k, st, bc, t);
Vector128<byte> diag2 = GetMVector128(k, st, ad, s);
// Pack the alternate pixels.
PackAndStore(a, b, diag1, diag2, output); // store top.
PackAndStore(c, d, diag2, diag1, output[(2 * 32)..]);
PackAndStoreVector128(a, b, diag1, diag2, output); // store top.
PackAndStoreVector128(c, d, diag2, diag1, output[(2 * 32)..]);
}
private static void UpSampleLastBlock(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
private static void UpSampleLastBlockVector128(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
{
Span<byte> r1 = stackalloc byte[17];
Span<byte> r2 = stackalloc byte[17];
@ -230,27 +230,27 @@ internal static class YuvConversion
ref byte r1Ref = ref MemoryMarshal.GetReference(r1);
ref byte r2Ref = ref MemoryMarshal.GetReference(r2);
UpSample32Pixels(ref r1Ref, ref r2Ref, output);
UpSample32PixelsVector128(ref r1Ref, ref r2Ref, output);
}
// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
private static Vector128<byte> GetM(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
private static Vector128<byte> GetMVector128(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
{
Vector128<byte> tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2
Vector128<byte> tmp1 = Sse2.And(ij, st); // (ij) & (s^t)
Vector128<byte> tmp2 = Sse2.Xor(k, input); // (k^in)
Vector128<byte> tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in)
Vector128<byte> tmp4 = Sse2.And(tmp3, Vector128.Create((byte)1)); // & 1 -> lsb_correction
Vector128<byte> tmp0 = Vector128_.Average(k, input); // (k + in + 1) / 2
Vector128<byte> tmp1 = ij & st; // (ij) & (s^t)
Vector128<byte> tmp2 = k ^ input; // (k^in)
Vector128<byte> tmp3 = tmp1 | tmp2; // ((ij) & (s^t)) | (k^in)
Vector128<byte> tmp4 = tmp3 & Vector128.Create((byte)1); // & 1 -> lsb_correction
return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction
return tmp0 - tmp4; // (k + in + 1) / 2 - lsb_correction
}
private static void PackAndStore(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
private static void PackAndStoreVector128(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
{
Vector128<byte> ta = Sse2.Average(a, da); // (9a + 3b + 3c + d + 8) / 16
Vector128<byte> tb = Sse2.Average(b, db); // (3a + 9b + c + 3d + 8) / 16
Vector128<byte> t1 = Sse2.UnpackLow(ta, tb);
Vector128<byte> t2 = Sse2.UnpackHigh(ta, tb);
Vector128<byte> ta = Vector128_.Average(a, da); // (9a + 3b + 3c + d + 8) / 16
Vector128<byte> tb = Vector128_.Average(b, db); // (3a + 9b + c + 3d + 8) / 16
Vector128<byte> t1 = Vector128_.UnpackLow(ta, tb);
Vector128<byte> t2 = Vector128_.UnpackHigh(ta, tb);
ref byte output0Ref = ref MemoryMarshal.GetReference(output);
ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16);
@ -562,41 +562,42 @@ internal static class YuvConversion
}
[MethodImpl(InliningOptions.ShortMethod)]
private static void ConvertYuvToBgrSse41(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step) => YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
private static void ConvertYuvToBgrVector128(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step)
=> YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
[MethodImpl(InliningOptions.ShortMethod)]
private static void ConvertYuvToBgrWithBottomYSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
private static void ConvertYuvToBgrWithBottomYVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
{
YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
YuvToBgrSse41(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
YuvToBgrVector128(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
}
private static void YuvToBgrSse41(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
private static void YuvToBgrVector128(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
{
ref byte yRef = ref MemoryMarshal.GetReference(y);
ref byte uRef = ref MemoryMarshal.GetReference(u);
ref byte vRef = ref MemoryMarshal.GetReference(v);
ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);
ConvertYuv444ToBgrVector128(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);
// Cast to 8b and store as BBBBGGGGRRRR.
Vector128<byte> bgr0 = Sse2.PackUnsignedSaturate(b0, b1);
Vector128<byte> bgr1 = Sse2.PackUnsignedSaturate(b2, b3);
Vector128<byte> bgr2 = Sse2.PackUnsignedSaturate(g0, g1);
Vector128<byte> bgr3 = Sse2.PackUnsignedSaturate(g2, g3);
Vector128<byte> bgr4 = Sse2.PackUnsignedSaturate(r0, r1);
Vector128<byte> bgr5 = Sse2.PackUnsignedSaturate(r2, r3);
Vector128<byte> bgr0 = Vector128_.PackUnsignedSaturate(b0, b1);
Vector128<byte> bgr1 = Vector128_.PackUnsignedSaturate(b2, b3);
Vector128<byte> bgr2 = Vector128_.PackUnsignedSaturate(g0, g1);
Vector128<byte> bgr3 = Vector128_.PackUnsignedSaturate(g2, g3);
Vector128<byte> bgr4 = Vector128_.PackUnsignedSaturate(r0, r1);
Vector128<byte> bgr5 = Vector128_.PackUnsignedSaturate(r2, r3);
// Pack as BGRBGRBGRBGR.
PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
PlanarTo24bVector128(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
}
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
private static void PlanarTo24bSse41(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
private static void PlanarTo24bVector128(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
{
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
@ -612,7 +613,7 @@ internal static class YuvConversion
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
// Process R.
ChannelMixing(
ChannelMixingVector128(
input0,
input1,
Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5), // PlanarTo24Shuffle0
@ -627,7 +628,7 @@ internal static class YuvConversion
// Process G.
// Same as before, just shifted to the left by one and including the right padding.
ChannelMixing(
ChannelMixingVector128(
input2,
input3,
Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255), // PlanarTo24Shuffle3
@ -641,7 +642,7 @@ internal static class YuvConversion
out Vector128<byte> g5);
// Process B.
ChannelMixing(
ChannelMixingVector128(
input4,
input5,
Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255), // PlanarTo24Shuffle6
@ -655,24 +656,24 @@ internal static class YuvConversion
out Vector128<byte> b5);
// OR the different channels.
Vector128<byte> rg0 = Sse2.Or(r0, g0);
Vector128<byte> rg1 = Sse2.Or(r1, g1);
Vector128<byte> rg2 = Sse2.Or(r2, g2);
Vector128<byte> rg3 = Sse2.Or(r3, g3);
Vector128<byte> rg4 = Sse2.Or(r4, g4);
Vector128<byte> rg5 = Sse2.Or(r5, g5);
Vector128<byte> rg0 = r0 | g0;
Vector128<byte> rg1 = r1 | g1;
Vector128<byte> rg2 = r2 | g2;
Vector128<byte> rg3 = r3 | g3;
Vector128<byte> rg4 = r4 | g4;
Vector128<byte> rg5 = r5 | g5;
ref byte outputRef = ref MemoryMarshal.GetReference(rgb);
Unsafe.As<byte, Vector128<byte>>(ref outputRef) = Sse2.Or(rg0, b0);
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1);
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2);
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3);
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4);
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5);
Unsafe.As<byte, Vector128<byte>>(ref outputRef) = rg0 | b0;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = rg1 | b1;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = rg2 | b2;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = rg3 | b3;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = rg4 | b4;
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = rg5 | b5;
}
// Shuffles the input buffer as A0 0 0 A1 0 0 A2
private static void ChannelMixing(
private static void ChannelMixingVector128(
Vector128<byte> input0,
Vector128<byte> input1,
Vector128<byte> shuffle0,
@ -685,53 +686,53 @@ internal static class YuvConversion
out Vector128<byte> output4,
out Vector128<byte> output5)
{
output0 = Ssse3.Shuffle(input0, shuffle0);
output1 = Ssse3.Shuffle(input0, shuffle1);
output2 = Ssse3.Shuffle(input0, shuffle2);
output3 = Ssse3.Shuffle(input1, shuffle0);
output4 = Ssse3.Shuffle(input1, shuffle1);
output5 = Ssse3.Shuffle(input1, shuffle2);
output0 = Vector128_.ShuffleNative(input0, shuffle0);
output1 = Vector128_.ShuffleNative(input0, shuffle1);
output2 = Vector128_.ShuffleNative(input0, shuffle2);
output3 = Vector128_.ShuffleNative(input1, shuffle0);
output4 = Vector128_.ShuffleNative(input1, shuffle1);
output5 = Vector128_.ShuffleNative(input1, shuffle2);
}
// Convert 32 samples of YUV444 to B/G/R
private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
private static void ConvertYuv444ToBgrVector128(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
{
// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
Vector128<byte> y0 = Unsafe.As<byte, Vector128<byte>>(ref y);
Vector128<byte> u0 = Unsafe.As<byte, Vector128<byte>>(ref u);
Vector128<byte> v0 = Unsafe.As<byte, Vector128<byte>>(ref v);
y0 = Sse2.UnpackLow(Vector128<byte>.Zero, y0);
u0 = Sse2.UnpackLow(Vector128<byte>.Zero, u0);
v0 = Sse2.UnpackLow(Vector128<byte>.Zero, v0);
y0 = Vector128_.UnpackLow(Vector128<byte>.Zero, y0);
u0 = Vector128_.UnpackLow(Vector128<byte>.Zero, u0);
v0 = Vector128_.UnpackLow(Vector128<byte>.Zero, v0);
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
var k19077 = Vector128.Create((ushort)19077);
var k26149 = Vector128.Create((ushort)26149);
var k14234 = Vector128.Create((ushort)14234);
Vector128<ushort> k19077 = Vector128.Create((ushort)19077);
Vector128<ushort> k26149 = Vector128.Create((ushort)26149);
Vector128<ushort> k14234 = Vector128.Create((ushort)14234);
Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), k19077);
Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), k26149);
Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
Vector128<ushort> y1 = Vector128_.MultiplyHigh(y0.AsUInt16(), k19077);
Vector128<ushort> r0 = Vector128_.MultiplyHigh(v0.AsUInt16(), k26149);
Vector128<ushort> g0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
Vector128<ushort> g1 = Vector128_.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
Vector128<ushort> r1 = Sse2.Subtract(y1.AsUInt16(), k14234);
Vector128<ushort> r2 = Sse2.Add(r1, r0);
Vector128<ushort> r1 = y1.AsUInt16() - k14234;
Vector128<ushort> r2 = r1 + r0;
Vector128<ushort> g2 = Sse2.Add(y1.AsUInt16(), Vector128.Create((ushort)8708));
Vector128<ushort> g3 = Sse2.Add(g0, g1);
Vector128<ushort> g4 = Sse2.Subtract(g2, g3);
Vector128<ushort> g2 = y1.AsUInt16() + Vector128.Create((ushort)8708);
Vector128<ushort> g3 = g0 + g1;
Vector128<ushort> g4 = g2 - g3;
Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
Vector128<ushort> b1 = Sse2.AddSaturate(b0, y1);
Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, Vector128.Create((ushort)17685));
Vector128<ushort> b0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
Vector128<ushort> b1 = Vector128_.AddSaturate(b0, y1);
Vector128<ushort> b2 = Vector128_.SubtractSaturate(b1, Vector128.Create((ushort)17685));
// Use logical shift for B2, which can be larger than 32767.
r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
r = Vector128.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
g = Vector128.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
b = Vector128.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
}
[MethodImpl(InliningOptions.ShortMethod)]

88
src/ImageSharp/Formats/Webp/WebpCommonUtils.cs

@ -3,7 +3,7 @@
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Formats.Webp;
@ -20,7 +20,7 @@ internal static class WebpCommonUtils
/// <returns>Returns true if alpha has non-0xff values.</returns>
public static unsafe bool CheckNonOpaque(ReadOnlySpan<Bgra32> row)
{
if (Avx2.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
int i = 0;
@ -32,20 +32,20 @@ internal static class WebpCommonUtils
for (; i + 128 <= length; i += 128)
{
Vector256<byte> a0 = Avx.LoadVector256(src + i).AsByte();
Vector256<byte> a1 = Avx.LoadVector256(src + i + 32).AsByte();
Vector256<byte> a2 = Avx.LoadVector256(src + i + 64).AsByte();
Vector256<byte> a3 = Avx.LoadVector256(src + i + 96).AsByte();
Vector256<int> b0 = Avx2.And(a0, alphaMaskVector256).AsInt32();
Vector256<int> b1 = Avx2.And(a1, alphaMaskVector256).AsInt32();
Vector256<int> b2 = Avx2.And(a2, alphaMaskVector256).AsInt32();
Vector256<int> b3 = Avx2.And(a3, alphaMaskVector256).AsInt32();
Vector256<short> c0 = Avx2.PackSignedSaturate(b0, b1).AsInt16();
Vector256<short> c1 = Avx2.PackSignedSaturate(b2, b3).AsInt16();
Vector256<byte> d = Avx2.PackSignedSaturate(c0, c1).AsByte();
Vector256<byte> bits = Avx2.CompareEqual(d, all0x80Vector256);
int mask = Avx2.MoveMask(bits);
if (mask != -1)
Vector256<byte> a0 = Vector256.Load(src + i).AsByte();
Vector256<byte> a1 = Vector256.Load(src + i + 32).AsByte();
Vector256<byte> a2 = Vector256.Load(src + i + 64).AsByte();
Vector256<byte> a3 = Vector256.Load(src + i + 96).AsByte();
Vector256<int> b0 = (a0 & alphaMaskVector256).AsInt32();
Vector256<int> b1 = (a1 & alphaMaskVector256).AsInt32();
Vector256<int> b2 = (a2 & alphaMaskVector256).AsInt32();
Vector256<int> b3 = (a3 & alphaMaskVector256).AsInt32();
Vector256<short> c0 = Vector256_.PackSignedSaturate(b0, b1).AsInt16();
Vector256<short> c1 = Vector256_.PackSignedSaturate(b2, b3).AsInt16();
Vector256<byte> d = Vector256_.PackSignedSaturate(c0, c1).AsByte();
Vector256<byte> bits = Vector256.Equals(d, all0x80Vector256);
uint mask = bits.ExtractMostSignificantBits();
if (mask != 0xFFFF_FFFF)
{
return true;
}
@ -53,7 +53,7 @@ internal static class WebpCommonUtils
for (; i + 64 <= length; i += 64)
{
if (IsNoneOpaque64Bytes(src, i))
if (IsNoneOpaque64BytesVector128(src, i))
{
return true;
}
@ -61,7 +61,7 @@ internal static class WebpCommonUtils
for (; i + 32 <= length; i += 32)
{
if (IsNoneOpaque32Bytes(src, i))
if (IsNonOpaque32BytesVector128(src, i))
{
return true;
}
@ -76,7 +76,7 @@ internal static class WebpCommonUtils
}
}
}
else if (Sse2.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
int i = 0;
@ -85,7 +85,7 @@ internal static class WebpCommonUtils
{
for (; i + 64 <= length; i += 64)
{
if (IsNoneOpaque64Bytes(src, i))
if (IsNoneOpaque64BytesVector128(src, i))
{
return true;
}
@ -93,7 +93,7 @@ internal static class WebpCommonUtils
for (; i + 32 <= length; i += 32)
{
if (IsNoneOpaque32Bytes(src, i))
if (IsNonOpaque32BytesVector128(src, i))
{
return true;
}
@ -122,38 +122,38 @@ internal static class WebpCommonUtils
return false;
}
private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i)
private static unsafe bool IsNoneOpaque64BytesVector128(byte* src, int i)
{
Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
Vector128<byte> a2 = Sse2.LoadVector128(src + i + 32).AsByte();
Vector128<byte> a3 = Sse2.LoadVector128(src + i + 48).AsByte();
Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
Vector128<int> b2 = Sse2.And(a2, alphaMask).AsInt32();
Vector128<int> b3 = Sse2.And(a3, alphaMask).AsInt32();
Vector128<short> c0 = Sse2.PackSignedSaturate(b0, b1).AsInt16();
Vector128<short> c1 = Sse2.PackSignedSaturate(b2, b3).AsInt16();
Vector128<byte> d = Sse2.PackSignedSaturate(c0, c1).AsByte();
Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
int mask = Sse2.MoveMask(bits);
Vector128<byte> a0 = Vector128.Load(src + i).AsByte();
Vector128<byte> a1 = Vector128.Load(src + i + 16).AsByte();
Vector128<byte> a2 = Vector128.Load(src + i + 32).AsByte();
Vector128<byte> a3 = Vector128.Load(src + i + 48).AsByte();
Vector128<int> b0 = (a0 & alphaMask).AsInt32();
Vector128<int> b1 = (a1 & alphaMask).AsInt32();
Vector128<int> b2 = (a2 & alphaMask).AsInt32();
Vector128<int> b3 = (a3 & alphaMask).AsInt32();
Vector128<short> c0 = Vector128_.PackSignedSaturate(b0, b1).AsInt16();
Vector128<short> c1 = Vector128_.PackSignedSaturate(b2, b3).AsInt16();
Vector128<byte> d = Vector128_.PackSignedSaturate(c0, c1).AsByte();
Vector128<byte> bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte());
uint mask = bits.ExtractMostSignificantBits();
return mask != 0xFFFF;
}
private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i)
private static unsafe bool IsNonOpaque32BytesVector128(byte* src, int i)
{
Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
Vector128<short> c = Sse2.PackSignedSaturate(b0, b1).AsInt16();
Vector128<byte> d = Sse2.PackSignedSaturate(c, c).AsByte();
Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
int mask = Sse2.MoveMask(bits);
Vector128<byte> a0 = Vector128.Load(src + i).AsByte();
Vector128<byte> a1 = Vector128.Load(src + i + 16).AsByte();
Vector128<int> b0 = (a0 & alphaMask).AsInt32();
Vector128<int> b1 = (a1 & alphaMask).AsInt32();
Vector128<short> c = Vector128_.PackSignedSaturate(b0, b1).AsInt16();
Vector128<byte> d = Vector128_.PackSignedSaturate(c, c).AsByte();
Vector128<byte> bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte());
uint mask = bits.ExtractMostSignificantBits();
return mask != 0xFFFF;
}
}

1
src/ImageSharp/Metadata/Profiles/IPTC/IptcProfile.cs

@ -3,7 +3,6 @@
using System.Buffers.Binary;
using System.Collections.ObjectModel;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Text;
using SixLabors.ImageSharp.Metadata.Profiles.IPTC;

2
src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs

@ -210,6 +210,8 @@ public partial class PixelOperations<TPixel>
{
GuardUnpackIntoRgbPlanes(redChannel, greenChannel, blueChannel, source);
// TODO: This can be much faster.
// Convert to Rgba32 first using pixel operations then use the R, G, B properties.
int count = source.Length;
ref float r = ref MemoryMarshal.GetReference(redChannel);

32
tests/ImageSharp.Tests/Formats/Tiff/TiffDecoderTests.cs

@ -341,16 +341,46 @@ public class TiffDecoderTests : TiffDecoderBaseTester
[Theory]
[WithFile(Cmyk, PixelTypes.Rgba32)]
[WithFile(CmykLzwPredictor, PixelTypes.Rgba32)]
[WithFile(CmykJpeg, PixelTypes.Rgba32)]
public void TiffDecoder_CanDecode_Cmyk<TPixel>(TestImageProvider<TPixel> provider)
where TPixel : unmanaged, IPixel<TPixel>
{
// Note: The image from MagickReferenceDecoder does not look right, maybe we are doing something wrong
// converting the pixel data from Magick.NET to our format with CMYK?
using Image<TPixel> image = provider.GetImage();
using Image<TPixel> image = provider.GetImage(TiffDecoder.Instance);
image.DebugSave(provider);
image.CompareToReferenceOutput(ImageComparer.Exact, provider);
}
[Theory]
[WithFile(Issues2454_A, PixelTypes.Rgba32)]
[WithFile(Issues2454_B, PixelTypes.Rgba32)]
public void TiffDecoder_CanDecode_YccK<TPixel>(TestImageProvider<TPixel> provider)
where TPixel : unmanaged, IPixel<TPixel>
{
using Image<TPixel> image = provider.GetImage(TiffDecoder.Instance);
image.DebugSave(provider);
image.CompareToReferenceOutput(ImageComparer.Exact, provider);
}
[Theory]
[WithFile(Issues2454_A, PixelTypes.Rgba32)]
[WithFile(Issues2454_B, PixelTypes.Rgba32)]
public void TiffDecoder_CanDecode_YccK_ICC<TPixel>(TestImageProvider<TPixel> provider)
where TPixel : unmanaged, IPixel<TPixel>
{
DecoderOptions options = new()
{
ColorProfileHandling = ColorProfileHandling.Convert,
};
using Image<TPixel> image = provider.GetImage(TiffDecoder.Instance, options);
image.DebugSave(provider);
// Linux reports a 0.0000% difference, so we use a tolerant comparer here.
image.CompareToReferenceOutput(ImageComparer.TolerantPercentage(0.0001F), provider);
}
[Theory]
[WithFile(FlowerRgb101010Contiguous, PixelTypes.Rgba32)]
[WithFile(FlowerRgb101010Planar, PixelTypes.Rgba32)]

8
tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs

@ -71,17 +71,17 @@ public class ColorSpaceTransformUtilsTests
public void CollectColorBlueTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.AllowAll);
[Fact]
public void CollectColorBlueTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
public void CollectColorBlueTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
[Fact]
public void CollectColorBlueTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
public void CollectColorBlueTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
[Fact]
public void CollectColorRedTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.AllowAll);
[Fact]
public void CollectColorRedTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
public void CollectColorRedTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
[Fact]
public void CollectColorRedTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
public void CollectColorRedTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
}

3
tests/ImageSharp.Tests/TestImages.cs

@ -1122,6 +1122,7 @@ public static class TestImages
public const string Cmyk = "Tiff/Cmyk.tiff";
public const string Cmyk64BitDeflate = "Tiff/cmyk_deflate_64bit.tiff";
public const string CmykLzwPredictor = "Tiff/Cmyk-lzw-predictor.tiff";
public const string CmykJpeg = "Tiff/Cmyk-jpeg.tiff";
public const string Issues1716Rgb161616BitLittleEndian = "Tiff/Issues/Issue1716.tiff";
public const string Issues1891 = "Tiff/Issues/Issue1891.tiff";
@ -1129,6 +1130,8 @@ public static class TestImages
public const string Issues2149 = "Tiff/Issues/Group4CompressionWithStrips.tiff";
public const string Issues2255 = "Tiff/Issues/Issue2255.png";
public const string Issues2435 = "Tiff/Issues/Issue2435.tiff";
public const string Issues2454_A = "Tiff/Issues/Issue2454_A.tif";
public const string Issues2454_B = "Tiff/Issues/Issue2454_B.tif";
public const string Issues2587 = "Tiff/Issues/Issue2587.tiff";
public const string Issues2679 = "Tiff/Issues/Issue2679.tiff";
public const string JpegCompressedGray0000539558 = "Tiff/Issues/JpegCompressedGray-0000539558.tiff";

3
tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_Cmyk_Rgba32_Cmyk-jpeg.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7f68db78d765a7f36570cd7b57a1f06cfca24c3b4916d0692a4aa051209ec327
size 616

3
tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_A.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c4f77673028643af0ac02a8f6a1e2db14052177e3401c369391a8ff7e943770c
size 7679254

3
tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_B.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e616895c21fd8b19a216e8a3ef4968bd413589b5875efdac29860f019a710527
size 7517284

3
tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_A.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d7911e059049c427229136479740fd62e2e09907549ec3e1421a6a60da6167cc
size 7840892

3
tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_B.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:291f2033a7b4cfc10fb3301283c167b3fbc288bc173c95b21bc726bf076865af
size 7649213

3
tests/Images/Input/Tiff/Cmyk-jpeg.tiff

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:abb923e457acc31a7f18c46a7d58fc5a42f5c3d197236403921e3ee623fa4fac
size 2046

3
tests/Images/Input/Tiff/Cmyk-planar-jpg.tiff

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:abb923e457acc31a7f18c46a7d58fc5a42f5c3d197236403921e3ee623fa4fac
size 2046

3
tests/Images/Input/Tiff/Issues/Issue2454_A.tif

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:868fbf7fc7a61bc6b1226160c8dc3bb1faebd8d4a2a6fe9494962f3fbe3a7fdc
size 5024256

3
tests/Images/Input/Tiff/Issues/Issue2454_B.tif

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:867851192f540742ba1481f503834f8aa77caa03ac59f8204d098bf940b0bb3a
size 4387646
Loading…
Cancel
Save