Browse Source

Use inline vectors when they are constants

pull/2122/head
Günther Foidl 4 years ago
parent
commit
dd35b743c5
  1. 5
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
  2. 26
      src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
  3. 92
      src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
  4. 84
      src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
  5. 147
      src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
  6. 38
      src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
  7. 101
      src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
  8. 6
      src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
  9. 10
      src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs
  10. 89
      src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
  11. 43
      src/ImageSharp/Formats/Webp/WebpCommonUtils.cs

5
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs

@ -35,8 +35,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
[FieldOffset(224)]
public Vector256<float> V7;
private static readonly Vector256<int> MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
@ -45,6 +43,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
ref Vector256<float> bBase = ref b.V0;
ref Vector256<short> destRef = ref dest.V01;
Vector256<int> multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
for (nint i = 0; i < 8; i += 2)
{
@ -52,7 +51,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16();
row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16();
Unsafe.Add(ref destRef, (IntPtr)((uint)i / 2)) = row;
}

26
src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs

@ -9,18 +9,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
internal static partial class FastFloatingPointDCT
{
#pragma warning disable SA1310, SA1311, IDE1006 // naming rule violation warnings
private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
private static readonly Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
private static readonly Vector256<float> mm256_F_1_8477 = Vector256.Create(1.847759065f);
private static readonly Vector256<float> mm256_F_n1_0823 = Vector256.Create(-1.082392200f);
private static readonly Vector256<float> mm256_F_n2_6131 = Vector256.Create(-2.613125930f);
#pragma warning restore SA1310, SA1311, IDE1006
/// <summary>
/// Apply floating point FDCT inplace using simd operations.
/// </summary>
@ -57,6 +45,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
block.V0 = Avx.Add(tmp10, tmp11);
block.V4 = Avx.Subtract(tmp10, tmp11);
Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
block.V2 = Avx.Add(tmp13, z1);
block.V6 = Avx.Subtract(tmp13, z1);
@ -66,9 +55,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
tmp11 = Avx.Add(tmp5, tmp6);
tmp12 = Avx.Add(tmp6, tmp7);
Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), Vector256.Create(0.382683433f)); // mm256_F_0_3826
Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411
Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065
Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
Vector256<float> z11 = Avx.Add(tmp7, z3);
@ -109,6 +98,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
Vector256<float> tmp10 = Avx.Add(z5, tmp2);
Vector256<float> tmp11 = Avx.Subtract(z5, tmp2);
Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
@ -131,10 +121,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
tmp7 = Avx.Add(z11, z13);
tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);
z5 = Avx.Multiply(Avx.Add(z10, z12), mm256_F_1_8477);
z5 = Avx.Multiply(Avx.Add(z10, z12), Vector256.Create(1.847759065f)); // mm256_F_1_8477
tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, mm256_F_n1_0823);
tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, mm256_F_n2_6131);
tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823
tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131
tmp6 = Avx.Subtract(tmp12, tmp7);
tmp5 = Avx.Subtract(tmp11, tmp6);

92
src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs

@ -13,36 +13,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
internal static class ColorSpaceTransformUtils
{
#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector128<byte> CollectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
private static readonly Vector128<byte> CollectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
private static readonly Vector256<byte> CollectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte();
private static readonly Vector256<byte> CollectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte();
private static readonly Vector128<byte> CollectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
private static readonly Vector128<byte> CollectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
private static readonly Vector128<byte> CollectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
private static readonly Vector128<byte> CollectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
private static readonly Vector128<byte> CollectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
private static readonly Vector256<byte> CollectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
private static readonly Vector256<byte> CollectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
private static readonly Vector256<byte> CollectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
private static readonly Vector256<byte> CollectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
private static readonly Vector256<byte> CollectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
#endif
public static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
{
#if SUPPORTS_RUNTIME_INTRINSICS
@ -50,8 +20,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
const int span = 16;
Span<ushort> values = stackalloc ushort[span];
var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
Vector256<byte> collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
Vector256<byte> collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
Vector256<byte> collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
Vector256<byte> collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector256<byte> collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
Vector256<short> multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
for (int y = 0; y < tileHeight; y++)
{
Span<uint> srcSpan = bgra.Slice(y * stride);
@ -62,18 +37,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
nint input1Idx = x + (span / 2);
Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector256<byte> r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256);
Vector256<byte> r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256);
Vector256<byte> r0 = Avx2.Shuffle(input0, collectColorBlueTransformsShuffleLowMask256);
Vector256<byte> r1 = Avx2.Shuffle(input1, collectColorBlueTransformsShuffleHighMask256);
Vector256<byte> r = Avx2.Or(r0, r1);
Vector256<byte> gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256);
Vector256<byte> gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256);
Vector256<byte> gb0 = Avx2.And(input0, collectColorBlueTransformsGreenBlueMask256);
Vector256<byte> gb1 = Avx2.And(input1, collectColorBlueTransformsGreenBlueMask256);
Vector256<ushort> gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector256<byte> g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256);
Vector256<byte> g = Avx2.And(gb.AsByte(), collectColorBlueTransformsGreenMask256);
Vector256<short> a = Avx2.MultiplyHigh(r.AsInt16(), multsr);
Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg);
Vector256<byte> c = Avx2.Subtract(gb.AsByte(), b.AsByte());
Vector256<byte> d = Avx2.Subtract(c, a.AsByte());
Vector256<byte> e = Avx2.And(d, CollectColorBlueTransformsBlueMask256);
Vector256<byte> e = Avx2.And(d, collectColorBlueTransformsBlueMask256);
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = e.AsUInt16();
@ -95,8 +70,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
const int span = 8;
Span<ushort> values = stackalloc ushort[span];
var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
Vector128<byte> collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
Vector128<byte> collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
Vector128<byte> collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
Vector128<byte> collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector128<byte> collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
Vector128<short> multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
for (int y = 0; y < tileHeight; y++)
{
Span<uint> srcSpan = bgra.Slice(y * stride);
@ -107,18 +87,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
nint input1Idx = x + (span / 2);
Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector128<byte> r0 = Ssse3.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask);
Vector128<byte> r1 = Ssse3.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask);
Vector128<byte> r0 = Ssse3.Shuffle(input0, collectColorBlueTransformsShuffleLowMask);
Vector128<byte> r1 = Ssse3.Shuffle(input1, collectColorBlueTransformsShuffleHighMask);
Vector128<byte> r = Sse2.Or(r0, r1);
Vector128<byte> gb0 = Sse2.And(input0, CollectColorBlueTransformsGreenBlueMask);
Vector128<byte> gb1 = Sse2.And(input1, CollectColorBlueTransformsGreenBlueMask);
Vector128<byte> gb0 = Sse2.And(input0, collectColorBlueTransformsGreenBlueMask);
Vector128<byte> gb1 = Sse2.And(input1, collectColorBlueTransformsGreenBlueMask);
Vector128<ushort> gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
Vector128<byte> g = Sse2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask);
Vector128<byte> g = Sse2.And(gb.AsByte(), collectColorBlueTransformsGreenMask);
Vector128<short> a = Sse2.MultiplyHigh(r.AsInt16(), multsr);
Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg);
Vector128<byte> c = Sse2.Subtract(gb.AsByte(), b.AsByte());
Vector128<byte> d = Sse2.Subtract(c, a.AsByte());
Vector128<byte> e = Sse2.And(d, CollectColorBlueTransformsBlueMask);
Vector128<byte> e = Sse2.And(d, collectColorBlueTransformsBlueMask);
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = e.AsUInt16();
@ -163,7 +143,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported && tileWidth >= 16)
{
var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
Vector256<byte> collectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte();
Vector256<byte> collectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte();
Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
const int span = 16;
Span<ushort> values = stackalloc ushort[span];
for (int y = 0; y < tileHeight; y++)
@ -176,15 +158,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
nint input1Idx = x + (span / 2);
Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector256<byte> g0 = Avx2.And(input0, CollectColorRedTransformsGreenMask256); // 0 0 | g 0
Vector256<byte> g1 = Avx2.And(input1, CollectColorRedTransformsGreenMask256);
Vector256<byte> g0 = Avx2.And(input0, collectColorRedTransformsGreenMask256); // 0 0 | g 0
Vector256<byte> g1 = Avx2.And(input1, collectColorRedTransformsGreenMask256);
Vector256<ushort> g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector256<int> a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector256<int> a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16);
Vector256<ushort> a = Avx2.PackUnsignedSaturate(a0, a1); // x r
Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector256<byte> c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r'
Vector256<byte> d = Avx2.And(c, CollectColorRedTransformsAndMask256); // 0 r'
Vector256<byte> d = Avx2.And(c, collectColorRedTransformsAndMask256); // 0 r'
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = d.AsUInt16();
@ -204,7 +186,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
else if (Sse41.IsSupported)
{
var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
Vector128<byte> collectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
Vector128<byte> collectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
const int span = 8;
Span<ushort> values = stackalloc ushort[span];
for (int y = 0; y < tileHeight; y++)
@ -217,15 +201,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
nint input1Idx = x + (span / 2);
Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
Vector128<byte> g0 = Sse2.And(input0, CollectColorRedTransformsGreenMask); // 0 0 | g 0
Vector128<byte> g1 = Sse2.And(input1, CollectColorRedTransformsGreenMask);
Vector128<byte> g0 = Sse2.And(input0, collectColorRedTransformsGreenMask); // 0 0 | g 0
Vector128<byte> g1 = Sse2.And(input1, collectColorRedTransformsGreenMask);
Vector128<ushort> g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
Vector128<int> a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
Vector128<int> a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16);
Vector128<ushort> a = Sse41.PackUnsignedSaturate(a0, a1); // x r
Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr
Vector128<byte> c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r'
Vector128<byte> d = Sse2.And(c, CollectColorRedTransformsAndMask); // 0 r'
Vector128<byte> d = Sse2.And(c, collectColorRedTransformsAndMask); // 0 r'
ref ushort outputRef = ref MemoryMarshal.GetReference(values);
Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = d.AsUInt16();

84
src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs

@ -27,36 +27,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
private const double Log2Reciprocal = 1.44269504088896338700465094007086;
#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector256<byte> AddGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
private static readonly Vector128<byte> AddGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
private static readonly byte AddGreenToBlueAndRedShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
private static readonly Vector256<byte> SubtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
private static readonly Vector128<byte> SubtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
private static readonly byte SubtractGreenFromBlueAndRedShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
private static readonly Vector128<byte> TransformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
private static readonly Vector256<byte> TransformColorAlphaGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
private static readonly Vector128<byte> TransformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
private static readonly Vector256<byte> TransformColorRedBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
private static readonly byte TransformColorShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
private static readonly Vector128<byte> TransformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
private static readonly Vector256<byte> TransformColorInverseAlphaGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
private static readonly byte TransformColorInverseShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
#endif
/// <summary>
/// Returns the exact index where array1 and array2 are different. For an index
/// inferior or equal to bestLenMatch, the return value just has to be strictly
@ -129,13 +99,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported)
{
Vector256<byte> addGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
int numPixels = pixelData.Length;
nint i;
for (i = 0; i <= numPixels - 8; i += 8)
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
Vector256<byte> in0g0g = Avx2.Shuffle(input, AddGreenToBlueAndRedMaskAvx2);
Vector256<byte> in0g0g = Avx2.Shuffle(input, addGreenToBlueAndRedMaskAvx2);
Vector256<byte> output = Avx2.Add(input, in0g0g);
Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
}
@ -147,13 +118,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
else if (Ssse3.IsSupported)
{
Vector128<byte> addGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
int numPixels = pixelData.Length;
nint i;
for (i = 0; i <= numPixels - 4; i += 4)
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<byte> in0g0g = Ssse3.Shuffle(input, AddGreenToBlueAndRedMaskSsse3);
Vector128<byte> in0g0g = Ssse3.Shuffle(input, addGreenToBlueAndRedMaskSsse3);
Vector128<byte> output = Sse2.Add(input, in0g0g);
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
}
@ -172,8 +144,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
Vector128<ushort> b = Sse2.ShuffleLow(a, AddGreenToBlueAndRedShuffleMask);
Vector128<ushort> c = Sse2.ShuffleHigh(b, AddGreenToBlueAndRedShuffleMask); // 0g0g
Vector128<ushort> b = Sse2.ShuffleLow(a, 0xA0); // MmShuffle(2, 2, 0, 0)
Vector128<ushort> c = Sse2.ShuffleHigh(b, 0xA0); // MmShuffle(2, 2, 0, 0) 0g0g
Vector128<byte> output = Sse2.Add(input.AsByte(), c.AsByte());
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
}
@ -209,13 +181,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported)
{
Vector256<byte> subtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
int numPixels = pixelData.Length;
nint i;
for (i = 0; i <= numPixels - 8; i += 8)
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
Vector256<byte> in0g0g = Avx2.Shuffle(input, SubtractGreenFromBlueAndRedMaskAvx2);
Vector256<byte> in0g0g = Avx2.Shuffle(input, subtractGreenFromBlueAndRedMaskAvx2);
Vector256<byte> output = Avx2.Subtract(input, in0g0g);
Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
}
@ -227,13 +200,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
else if (Ssse3.IsSupported)
{
Vector128<byte> subtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
int numPixels = pixelData.Length;
nint i;
for (i = 0; i <= numPixels - 4; i += 4)
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<byte> in0g0g = Ssse3.Shuffle(input, SubtractGreenFromBlueAndRedMaskSsse3);
Vector128<byte> in0g0g = Ssse3.Shuffle(input, subtractGreenFromBlueAndRedMaskSsse3);
Vector128<byte> output = Sse2.Subtract(input, in0g0g);
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
}
@ -252,8 +226,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
Vector128<ushort> b = Sse2.ShuffleLow(a, SubtractGreenFromBlueAndRedShuffleMask);
Vector128<ushort> c = Sse2.ShuffleHigh(b, SubtractGreenFromBlueAndRedShuffleMask); // 0g0g
Vector128<ushort> b = Sse2.ShuffleLow(a, 0xA0); // MmShuffle(2, 2, 0, 0)
Vector128<ushort> c = Sse2.ShuffleHigh(b, 0xA0); // MmShuffle(2, 2, 0, 0) 0g0g
Vector128<byte> output = Sse2.Subtract(input.AsByte(), c.AsByte());
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
}
@ -400,6 +374,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported && numPixels >= 8)
{
Vector256<byte> transformColorAlphaGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector256<byte> transformColorRedBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
Vector256<int> multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
Vector256<int> multsb2 = MkCst32(Cst5b(m.RedToBlue), 0);
@ -408,15 +384,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
Vector256<uint> input = Unsafe.As<uint, Vector256<uint>>(ref pos);
Vector256<byte> a = Avx2.And(input.AsByte(), TransformColorAlphaGreenMask256);
Vector256<short> b = Avx2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask);
Vector256<short> c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask);
Vector256<byte> a = Avx2.And(input.AsByte(), transformColorAlphaGreenMask256);
Vector256<short> b = Avx2.ShuffleLow(a.AsInt16(), 0xA0); // MmShuffle(2, 2, 0, 0)
Vector256<short> c = Avx2.ShuffleHigh(b.AsInt16(), 0xA0); // MmShuffle(2, 2, 0, 0)
Vector256<short> d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector256<short> e = Avx2.ShiftLeftLogical(input.AsInt16(), 8);
Vector256<short> f = Avx2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
Vector256<int> g = Avx2.ShiftRightLogical(f.AsInt32(), 16);
Vector256<byte> h = Avx2.Add(g.AsByte(), d.AsByte());
Vector256<byte> i = Avx2.And(h, TransformColorRedBlueMask256);
Vector256<byte> i = Avx2.And(h, transformColorRedBlueMask256);
Vector256<byte> output = Avx2.Subtract(input.AsByte(), i);
Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
}
@ -428,6 +404,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
else if (Sse2.IsSupported)
{
Vector128<byte> transformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector128<byte> transformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
Vector128<int> multsb2 = MkCst16(Cst5b(m.RedToBlue), 0);
nint idx;
@ -435,15 +413,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
Vector128<byte> a = Sse2.And(input.AsByte(), TransformColorAlphaGreenMask);
Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask);
Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask);
Vector128<byte> a = Sse2.And(input.AsByte(), transformColorAlphaGreenMask);
Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), 0xA0); // MmShuffle(2, 2, 0, 0)
Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), 0xA0); // MmShuffle(2, 2, 0, 0)
Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector128<short> e = Sse2.ShiftLeftLogical(input.AsInt16(), 8);
Vector128<short> f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
Vector128<int> g = Sse2.ShiftRightLogical(f.AsInt32(), 16);
Vector128<byte> h = Sse2.Add(g.AsByte(), d.AsByte());
Vector128<byte> i = Sse2.And(h, TransformColorRedBlueMask);
Vector128<byte> i = Sse2.And(h, transformColorRedBlueMask);
Vector128<byte> output = Sse2.Subtract(input.AsByte(), i);
Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
}
@ -488,6 +466,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported && pixelData.Length >= 8)
{
Vector256<byte> transformColorInverseAlphaGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector256<int> multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
Vector256<int> multsb2 = MkCst32(Cst5b(m.RedToBlue), 0);
nint idx;
@ -495,9 +474,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
Vector256<uint> input = Unsafe.As<uint, Vector256<uint>>(ref pos);
Vector256<byte> a = Avx2.And(input.AsByte(), TransformColorInverseAlphaGreenMask256);
Vector256<short> b = Avx2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask);
Vector256<short> c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask);
Vector256<byte> a = Avx2.And(input.AsByte(), transformColorInverseAlphaGreenMask256);
Vector256<short> b = Avx2.ShuffleLow(a.AsInt16(), 0xA0); // MmShuffle(2, 2, 0, 0)
Vector256<short> c = Avx2.ShuffleHigh(b.AsInt16(), 0xA0); // MmShuffle(2, 2, 0, 0)
Vector256<short> d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector256<byte> e = Avx2.Add(input.AsByte(), d.AsByte());
Vector256<short> f = Avx2.ShiftLeftLogical(e.AsInt16(), 8);
@ -516,6 +495,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
else if (Sse2.IsSupported)
{
Vector128<byte> transformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
Vector128<int> multsb2 = MkCst16(Cst5b(m.RedToBlue), 0);
@ -524,9 +504,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
Vector128<byte> a = Sse2.And(input.AsByte(), TransformColorInverseAlphaGreenMask);
Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask);
Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask);
Vector128<byte> a = Sse2.And(input.AsByte(), transformColorInverseAlphaGreenMask);
Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), 0xA0); // MmShuffle(2, 2, 0, 0)
Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), 0xA0); // MmShuffle(2, 2, 0, 0)
Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
Vector128<byte> e = Sse2.Add(input.AsByte(), d.AsByte());
Vector128<short> f = Sse2.ShiftLeftLogical(e.AsInt16(), 8);

147
src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

@ -15,29 +15,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
internal static class LossyUtils
{
#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
private static readonly Vector128<byte> SignBit = Vector128.Create((byte)0x80);
private static readonly Vector128<sbyte> Three = Vector128.Create((byte)3).AsSByte();
private static readonly Vector128<short> FourShort = Vector128.Create((short)4);
private static readonly Vector128<sbyte> FourSByte = Vector128.Create((byte)4).AsSByte();
private static readonly Vector128<sbyte> Nine = Vector128.Create((short)0x0900).AsSByte();
private static readonly Vector128<sbyte> SixtyThree = Vector128.Create((short)63).AsSByte();
private static readonly Vector128<sbyte> SixtyFour = Vector128.Create((byte)64).AsSByte();
private static readonly Vector128<short> K1 = Vector128.Create((short)20091);
private static readonly Vector128<short> K2 = Vector128.Create((short)-30068);
#endif
// Note: method name in libwebp reference implementation is called VP8SSE16x16.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b)
@ -1025,16 +1002,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
Vector128<short> k1 = Vector128.Create((short)20091);
Vector128<short> k2 = Vector128.Create((short)-30068);
// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2);
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1);
Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
Vector128<short> c4 = Sse2.Subtract(c1, c2);
Vector128<short> c = Sse2.Add(c3.AsInt16(), c4);
// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1);
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2);
Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
Vector128<short> d4 = Sse2.Add(d1, d2);
Vector128<short> d = Sse2.Add(d3, d4);
@ -1050,20 +1030,20 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// Horizontal pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
Vector128<short> dc = Sse2.Add(t0.AsInt16(), FourShort);
Vector128<short> dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4));
a = Sse2.Add(dc, t2.AsInt16());
b = Sse2.Subtract(dc, t2.AsInt16());
// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2);
c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1);
c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
c4 = Sse2.Subtract(c1, c2);
c = Sse2.Add(c3, c4);
// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1);
d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2);
d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
d4 = Sse2.Add(d1, d2);
d = Sse2.Add(d3, d4);
@ -1146,16 +1126,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
Vector128<short> k1 = Vector128.Create((short)20091);
Vector128<short> k2 = Vector128.Create((short)-30068);
// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2);
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1);
Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
Vector128<short> c4 = Sse2.Subtract(c1, c2);
Vector128<short> c = Sse2.Add(c3.AsInt16(), c4);
// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1);
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2);
Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
Vector128<short> d4 = Sse2.Add(d1, d2);
Vector128<short> d = Sse2.Add(d3, d4);
@ -1171,20 +1154,20 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// Horizontal pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
Vector128<short> dc = Sse2.Add(t0.AsInt16(), FourShort);
Vector128<short> dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4));
a = Sse2.Add(dc, t2.AsInt16());
b = Sse2.Subtract(dc, t2.AsInt16());
// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2);
c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1);
c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
c4 = Sse2.Subtract(c1, c2);
c = Sse2.Add(c3, c4);
// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1);
d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2);
d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
d4 = Sse2.Add(d1, d2);
d = Sse2.Add(d3, d4);
@ -1810,6 +1793,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
#if SUPPORTS_RUNTIME_INTRINSICS
if (Ssse3.IsSupported)
{
Vector128<byte> mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input));
Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16)));
Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16)));
@ -1818,10 +1803,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);
Vector128<byte> c0 = Sse2.And(a0, mean16x4Mask); // lo byte
Vector128<byte> c1 = Sse2.And(a1, mean16x4Mask);
Vector128<byte> c2 = Sse2.And(a2, mean16x4Mask);
Vector128<byte> c3 = Sse2.And(a3, mean16x4Mask);
Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
@ -1978,14 +1963,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// Applies filter on 2 pixels (p0 and q0)
private static void DoFilter2Sse2(ref Vector128<byte> p1, ref Vector128<byte> p0, ref Vector128<byte> q0, ref Vector128<byte> q1, int thresh)
{
Vector128<byte> signBit = Vector128.Create((byte)0x80);
// Convert p1/q1 to byte (for GetBaseDelta).
Vector128<byte> p1s = Sse2.Xor(p1, SignBit);
Vector128<byte> q1s = Sse2.Xor(q1, SignBit);
Vector128<byte> p1s = Sse2.Xor(p1, signBit);
Vector128<byte> q1s = Sse2.Xor(q1, signBit);
Vector128<byte> mask = NeedsFilter(p1, p0, q0, q1, thresh);
// Flip sign.
p0 = Sse2.Xor(p0, SignBit);
q0 = Sse2.Xor(q0, SignBit);
p0 = Sse2.Xor(p0, signBit);
q0 = Sse2.Xor(q0, signBit);
Vector128<byte> a = GetBaseDelta(p1s.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1s.AsSByte()).AsByte();
@ -1995,8 +1982,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
DoSimpleFilterSse2(ref p0, ref q0, a);
// Flip sign.
p0 = Sse2.Xor(p0, SignBit);
q0 = Sse2.Xor(q0, SignBit);
p0 = Sse2.Xor(p0, signBit);
q0 = Sse2.Xor(q0, signBit);
}
// Applies filter on 4 pixels (p1, p0, q0 and q1)
@ -2005,11 +1992,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// Compute hev mask.
Vector128<byte> notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh);
Vector128<byte> signBit = Vector128.Create((byte)0x80);
// Convert to signed values.
p1 = Sse2.Xor(p1, SignBit);
p0 = Sse2.Xor(p0, SignBit);
q0 = Sse2.Xor(q0, SignBit);
q1 = Sse2.Xor(q1, SignBit);
p1 = Sse2.Xor(p1, signBit);
p0 = Sse2.Xor(p0, signBit);
q0 = Sse2.Xor(q0, signBit);
q1 = Sse2.Xor(q1, signBit);
Vector128<sbyte> t1 = Sse2.SubtractSaturate(p1.AsSByte(), q1.AsSByte()); // p1 - q1
t1 = Sse2.AndNot(notHev, t1.AsByte()).AsSByte(); // hev(p1 - q1)
@ -2019,25 +2008,25 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0)
t1 = Sse2.And(t1.AsByte(), mask).AsSByte(); // mask filter values we don't care about.
t2 = Sse2.AddSaturate(t1, Three); // 3 * (q0 - p0) + hev(p1 - q1) + 3
Vector128<sbyte> t3 = Sse2.AddSaturate(t1, FourSByte); // 3 * (q0 - p0) + hev(p1 - q1) + 4
t2 = Sse2.AddSaturate(t1, Vector128.Create((byte)3).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 3
Vector128<sbyte> t3 = Sse2.AddSaturate(t1, Vector128.Create((byte)4).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 4
t2 = SignedShift8b(t2.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
t3 = SignedShift8b(t3.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
p0 = Sse2.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2
q0 = Sse2.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3
p0 = Sse2.Xor(p0, SignBit);
q0 = Sse2.Xor(q0, SignBit);
p0 = Sse2.Xor(p0, signBit);
q0 = Sse2.Xor(q0, signBit);
// This is equivalent to signed (a + 1) >> 1 calculation.
t2 = Sse2.Add(t3, SignBit.AsSByte());
t2 = Sse2.Add(t3, signBit.AsSByte());
t3 = Sse2.Average(t2.AsByte(), Vector128<byte>.Zero).AsSByte();
t3 = Sse2.Subtract(t3, SixtyFour);
t3 = Sse2.Subtract(t3, Vector128.Create((sbyte)64));
t3 = Sse2.And(notHev, t3.AsByte()).AsSByte(); // if !hev
q1 = Sse2.SubtractSaturate(q1.AsSByte(), t3).AsByte(); // q1 -= t3
p1 = Sse2.AddSaturate(p1.AsSByte(), t3).AsByte(); // p1 += t3
p1 = Sse2.Xor(p1.AsByte(), SignBit);
q1 = Sse2.Xor(q1.AsByte(), SignBit);
p1 = Sse2.Xor(p1.AsByte(), signBit);
q1 = Sse2.Xor(q1.AsByte(), signBit);
}
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
@ -2047,12 +2036,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<byte> notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh);
// Convert to signed values.
p1 = Sse2.Xor(p1, SignBit);
p0 = Sse2.Xor(p0, SignBit);
q0 = Sse2.Xor(q0, SignBit);
q1 = Sse2.Xor(q1, SignBit);
p2 = Sse2.Xor(p2, SignBit);
q2 = Sse2.Xor(q2, SignBit);
Vector128<byte> signBit = Vector128.Create((byte)0x80);
p1 = Sse2.Xor(p1, signBit);
p0 = Sse2.Xor(p0, signBit);
q0 = Sse2.Xor(q0, signBit);
q1 = Sse2.Xor(q1, signBit);
p2 = Sse2.Xor(p2, signBit);
q2 = Sse2.Xor(q2, signBit);
Vector128<sbyte> a = GetBaseDelta(p1.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1.AsSByte());
@ -2067,11 +2057,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<byte> flow = Sse2.UnpackLow(Vector128<byte>.Zero, f);
Vector128<byte> fhigh = Sse2.UnpackHigh(Vector128<byte>.Zero, f);
Vector128<short> f9Low = Sse2.MultiplyHigh(flow.AsInt16(), Nine.AsInt16()); // Filter (lo) * 9
Vector128<short> f9High = Sse2.MultiplyHigh(fhigh.AsInt16(), Nine.AsInt16()); // Filter (hi) * 9
Vector128<short> nine = Vector128.Create((short)0x0900);
Vector128<short> f9Low = Sse2.MultiplyHigh(flow.AsInt16(), nine); // Filter (lo) * 9
Vector128<short> f9High = Sse2.MultiplyHigh(fhigh.AsInt16(), nine); // Filter (hi) * 9
Vector128<short> a2Low = Sse2.Add(f9Low, SixtyThree.AsInt16()); // Filter * 9 + 63
Vector128<short> a2High = Sse2.Add(f9High, SixtyThree.AsInt16()); // Filter * 9 + 63
Vector128<short> sixtyThree = Vector128.Create((short)63);
Vector128<short> a2Low = Sse2.Add(f9Low, sixtyThree); // Filter * 9 + 63
Vector128<short> a2High = Sse2.Add(f9High, sixtyThree); // Filter * 9 + 63
Vector128<short> a1Low = Sse2.Add(a2Low, f9Low); // Filter * 18 + 63
Vector128<short> a1High = Sse2.Add(a2High, f9High); // Filter * 18 + 63
@ -2086,8 +2078,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
private static void DoSimpleFilterSse2(ref Vector128<byte> p0, ref Vector128<byte> q0, Vector128<byte> fl)
{
Vector128<sbyte> v3 = Sse2.AddSaturate(fl.AsSByte(), Three);
Vector128<sbyte> v4 = Sse2.AddSaturate(fl.AsSByte(), FourSByte);
Vector128<sbyte> v3 = Sse2.AddSaturate(fl.AsSByte(), Vector128.Create((byte)3).AsSByte());
Vector128<sbyte> v4 = Sse2.AddSaturate(fl.AsSByte(), Vector128.Create((byte)4).AsSByte());
v4 = SignedShift8b(v4.AsByte()).AsSByte(); // v4 >> 3
v3 = SignedShift8b(v3.AsByte()).AsSByte(); // v3 >> 3
@ -2353,13 +2345,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
// Derives one signed byte delta per lane from the 16-bit values in a0Low/a0High
// (arithmetic shift right by 7, then a saturating pack to sbyte), adds that delta to 'pi'
// and subtracts it from 'qi' with signed saturation, and finally XORs both with 0x80.
// NOTE(review): this listing is a rendered diff. The XOR pair that reads the static 'SignBit'
// field looks like the pre-change code and the pair that reads the local 'signBit' looks like
// its replacement; confirm against the real file before treating all four XORs as live code.
private static void Update2Pixels(ref Vector128<byte> pi, ref Vector128<byte> qi, Vector128<short> a0Low, Vector128<short> a0High)
{
// 0x80 in every byte; XOR-ing with it flips the top bit of each byte.
Vector128<byte> signBit = Vector128.Create((byte)0x80);
// a0 >> 7 on each 16-bit lane, keeping the sign.
Vector128<short> a1Low = Sse2.ShiftRightArithmetic(a0Low, 7);
Vector128<short> a1High = Sse2.ShiftRightArithmetic(a0High, 7);
// Narrow the low and high halves into one vector of signed bytes, saturating.
Vector128<sbyte> delta = Sse2.PackSignedSaturate(a1Low, a1High);
// pi += delta, qi -= delta, both with signed saturation.
pi = Sse2.AddSaturate(pi.AsSByte(), delta).AsByte();
qi = Sse2.SubtractSaturate(qi.AsSByte(), delta).AsByte();
// Flip the sign bit to go back to the unsigned representation.
pi = Sse2.Xor(pi, SignBit.AsByte());
qi = Sse2.Xor(qi, SignBit.AsByte());
pi = Sse2.Xor(pi, signBit.AsByte());
qi = Sse2.Xor(qi, signBit.AsByte());
}
[MethodImpl(InliningOptions.ShortMethod)]

38
src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs

@ -20,24 +20,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
private const int MaxLevel = 2047;
#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector128<short> MaxCoeff2047 = Vector128.Create((short)MaxLevel);
private static readonly Vector256<short> MaxCoeff2047Vec256 = Vector256.Create((short)MaxLevel);
private static readonly Vector256<byte> Cst256 = Vector256.Create(0, 1, 2, 3, 8, 9, 254, 255, 10, 11, 4, 5, 6, 7, 12, 13, 2, 3, 8, 9, 10, 11, 4, 5, 254, 255, 6, 7, 12, 13, 14, 15);
private static readonly Vector256<byte> Cst78 = Vector256.Create(254, 255, 254, 255, 254, 255, 254, 255, 14, 15, 254, 255, 254, 255, 254, 255, 254, 255, 254, 255, 254, 255, 0, 1, 254, 255, 254, 255, 254, 255, 254, 255);
private static readonly Vector128<byte> CstLo = Vector128.Create(0, 1, 2, 3, 8, 9, 254, 255, 10, 11, 4, 5, 6, 7, 12, 13);
private static readonly Vector128<byte> Cst7 = Vector128.Create(254, 255, 254, 255, 254, 255, 254, 255, 14, 15, 254, 255, 254, 255, 254, 255);
private static readonly Vector128<byte> CstHi = Vector128.Create(2, 3, 8, 9, 10, 11, 4, 5, 254, 255, 6, 7, 12, 13, 14, 15);
private static readonly Vector128<byte> Cst8 = Vector128.Create(254, 255, 254, 255, 254, 255, 0, 1, 254, 255, 254, 255, 254, 255, 254, 255);
#endif
// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
private const int C1 = 7; // fraction of error sent to the 4x4 block below
@ -574,8 +556,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector256<short> out0 = Avx2.PackSignedSaturate(out00.AsInt32(), out08.AsInt32());
// if (coeff > 2047) coeff = 2047
out0 = Avx2.Min(out0, MaxCoeff2047Vec256);
out0 = Avx2.Min(out0, Vector256.Create((short)MaxLevel));
// Put the sign back.
out0 = Avx2.Sign(out0, input0);
@ -585,8 +566,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Unsafe.As<short, Vector256<short>>(ref inputRef) = input0;
// zigzag the output before storing it.
Vector256<byte> tmp256 = Avx2.Shuffle(out0.AsByte(), Cst256);
Vector256<byte> tmp78 = Avx2.Shuffle(out0.AsByte(), Cst78);
Vector256<byte> tmp256 = Avx2.Shuffle(out0.AsByte(), Vector256.Create(0, 1, 2, 3, 8, 9, 254, 255, 10, 11, 4, 5, 6, 7, 12, 13, 2, 3, 8, 9, 10, 11, 4, 5, 254, 255, 6, 7, 12, 13, 14, 15)); // Cst256
Vector256<byte> tmp78 = Avx2.Shuffle(out0.AsByte(), Vector256.Create(254, 255, 254, 255, 254, 255, 254, 255, 14, 15, 254, 255, 254, 255, 254, 255, 254, 255, 254, 255, 254, 255, 0, 1, 254, 255, 254, 255, 254, 255, 254, 255)); // Cst78
// Reverse the order of the 16-byte lanes.
Vector256<byte> tmp87 = Avx2.Permute2x128(tmp78, tmp78, 1);
@ -654,8 +635,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<short> out8 = Sse2.PackSignedSaturate(out08.AsInt32(), out12.AsInt32());
// if (coeff > 2047) coeff = 2047
out0 = Sse2.Min(out0, MaxCoeff2047);
out8 = Sse2.Min(out8, MaxCoeff2047);
Vector128<short> maxCoeff2047 = Vector128.Create((short)MaxLevel);
out0 = Sse2.Min(out0, maxCoeff2047);
out8 = Sse2.Min(out8, maxCoeff2047);
// Put the sign back.
out0 = Ssse3.Sign(out0, input0);
@ -676,10 +658,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// There's only two misplaced entries ([8] and [7]) that are crossing the
// reg's boundaries.
// We use pshufb instead of pshuflo/pshufhi.
Vector128<byte> tmpLo = Ssse3.Shuffle(out0.AsByte(), CstLo);
Vector128<byte> tmp7 = Ssse3.Shuffle(out0.AsByte(), Cst7); // extract #7
Vector128<byte> tmpHi = Ssse3.Shuffle(out8.AsByte(), CstHi);
Vector128<byte> tmp8 = Ssse3.Shuffle(out8.AsByte(), Cst8); // extract #8
Vector128<byte> tmpLo = Ssse3.Shuffle(out0.AsByte(), Vector128.Create(0, 1, 2, 3, 8, 9, 254, 255, 10, 11, 4, 5, 6, 7, 12, 13));
Vector128<byte> tmp7 = Ssse3.Shuffle(out0.AsByte(), Vector128.Create(254, 255, 254, 255, 254, 255, 254, 255, 14, 15, 254, 255, 254, 255, 254, 255)); // extract #7
Vector128<byte> tmpHi = Ssse3.Shuffle(out8.AsByte(), Vector128.Create(2, 3, 8, 9, 10, 11, 4, 5, 254, 255, 6, 7, 12, 13, 14, 15));
Vector128<byte> tmp8 = Ssse3.Shuffle(out8.AsByte(), Vector128.Create(254, 255, 254, 255, 254, 255, 0, 1, 254, 255, 254, 255, 254, 255, 254, 255)); // extract #8
Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);

101
src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs

@ -21,7 +21,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
private const int KC2 = 35468;
private static readonly byte[] Clip1 = new byte[255 + 510 + 1]; // clips [-255,510] to [0,255]
private static readonly byte[] Clip1 = GetClip1(); // clips [-255,510] to [0,255]
private const int I16DC16 = 0 * 16 * WebpConstants.Bps;
@ -65,48 +65,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 };
#if SUPPORTS_RUNTIME_INTRINSICS
#pragma warning disable SA1310 // Field names should not contain underscore
private static readonly Vector128<short> K1 = Vector128.Create((short)20091).AsInt16();
private static readonly Vector128<short> K2 = Vector128.Create((short)-30068).AsInt16();
private static readonly Vector128<short> Four = Vector128.Create((short)4);
private static readonly Vector128<short> Seven = Vector128.Create((short)7);
private static readonly Vector128<short> K88p = Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16();
private static readonly Vector128<short> K88m = Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16();
private static readonly Vector128<short> K5352_2217p = Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16();
private static readonly Vector128<short> K5352_2217m = Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16();
private static readonly Vector128<int> K937 = Vector128.Create(937);
private static readonly Vector128<int> K1812 = Vector128.Create(1812);
private static readonly Vector128<short> K5352_2217 = Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16();
private static readonly Vector128<short> K2217_5352 = Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16();
private static readonly Vector128<int> K12000PlusOne = Vector128.Create(12000 + (1 << 16));
private static readonly Vector128<int> K51000 = Vector128.Create(51000);
private static readonly byte MmShuffle2301 = SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1);
private static readonly byte MmShuffle1032 = SimdUtils.Shuffle.MmShuffle(1, 0, 3, 2);
#pragma warning restore SA1310 // Field names should not contain underscore
#endif
static Vp8Encoding()
// Builds the 766-entry (255 + 510 + 1) lookup table used to initialise the Clip1 field:
// entry [255 + i] holds Clip8b(i) for every i in [-255, 510], so a possibly negative
// value is looked up by adding an offset of 255 to it.
// NOTE(review): this listing is a rendered diff. The store into 'Clip1[255 + i]' looks like the
// pre-change line (static-constructor version) and the store into the local 'clip1[255 + i]'
// looks like its replacement; confirm against the real file.
private static byte[] GetClip1()
{
byte[] clip1 = new byte[255 + 510 + 1];
// i runs over the full input range, both ends inclusive (255 + 255 == 510).
for (int i = -255; i <= 255 + 255; i++)
{
Clip1[255 + i] = Clip8b(i);
clip1[255 + i] = Clip8b(i);
}
return clip1;
}
// Transforms (Paragraph 14.4)
@ -347,16 +315,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
Vector128<short> k1 = Vector128.Create((short)20091).AsInt16();
Vector128<short> k2 = Vector128.Create((short)-30068).AsInt16();
// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2);
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1);
Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
Vector128<short> c4 = Sse2.Subtract(c1, c2);
Vector128<short> c = Sse2.Add(c3, c4);
// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1);
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2);
Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
Vector128<short> d4 = Sse2.Add(d1, d2);
Vector128<short> d = Sse2.Add(d3, d4);
@ -370,20 +341,23 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
private static void InverseTransformHorizontalPass(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
{
Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
Vector128<short> dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4));
Vector128<short> a = Sse2.Add(dc, t2.AsInt16());
Vector128<short> b = Sse2.Subtract(dc, t2.AsInt16());
Vector128<short> k1 = Vector128.Create((short)20091).AsInt16();
Vector128<short> k2 = Vector128.Create((short)-30068).AsInt16();
// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
Vector128<short> c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
Vector128<short> c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
Vector128<short> c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2);
Vector128<short> c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1);
Vector128<short> c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
Vector128<short> c4 = Sse2.Subtract(c1, c2);
Vector128<short> c = Sse2.Add(c3, c4);
// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
Vector128<short> d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
Vector128<short> d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
Vector128<short> d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1);
Vector128<short> d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2);
Vector128<short> d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
Vector128<short> d4 = Sse2.Add(d1, d2);
Vector128<short> d = Sse2.Add(d3, d4);
@ -561,8 +535,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
// *in01 = 00 01 10 11 02 03 12 13
// *in23 = 20 21 30 31 22 23 32 33
Vector128<short> shuf01_p = Sse2.ShuffleHigh(row01, MmShuffle2301);
Vector128<short> shuf32_p = Sse2.ShuffleHigh(row23, MmShuffle2301);
Vector128<short> shuf01_p = Sse2.ShuffleHigh(row01, 0xB1); // MmShuffle(2, 3, 0, 1)
Vector128<short> shuf32_p = Sse2.ShuffleHigh(row23, 0xB1); // MmShuffle(2, 3, 0, 1)
// 00 01 10 11 03 02 13 12
// 20 21 30 31 23 22 33 32
@ -576,12 +550,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
// [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
Vector128<int> tmp0 = Sse2.MultiplyAddAdjacent(a01, K88p); // [ (a0 + a1) << 3, ... ]
Vector128<int> tmp2 = Sse2.MultiplyAddAdjacent(a01, K88m); // [ (a0 - a1) << 3, ... ]
Vector128<int> tmp11 = Sse2.MultiplyAddAdjacent(a32, K5352_2217p);
Vector128<int> tmp31 = Sse2.MultiplyAddAdjacent(a32, K5352_2217m);
Vector128<int> tmp12 = Sse2.Add(tmp11, K1812);
Vector128<int> tmp32 = Sse2.Add(tmp31, K937);
// [ (a0 + a1) << 3, ... ]
Vector128<int> tmp0 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p
// [ (a0 - a1) << 3, ... ]
Vector128<int> tmp2 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16()); // K88m
Vector128<int> tmp11 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16()); // K5352_2217p
Vector128<int> tmp31 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16()); // K5352_2217m
Vector128<int> tmp12 = Sse2.Add(tmp11, Vector128.Create(1812));
Vector128<int> tmp32 = Sse2.Add(tmp31, Vector128.Create(937));
Vector128<int> tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9);
Vector128<int> tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9);
Vector128<short> s03 = Sse2.PackSignedSaturate(tmp0, tmp2);
@ -590,7 +567,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<short> shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3
Vector128<int> v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32());
out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32());
out32 = Sse2.Shuffle(v23, MmShuffle1032);
out32 = Sse2.Shuffle(v23, 0x4E); // MmShuffle(1, 0, 3, 2)
}
public static void FTransformPass2SSE2(Vector128<int> v01, Vector128<int> v32, Span<short> output)
@ -602,10 +579,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<long> a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64());
Vector128<short> b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16());
Vector128<int> c1 = Sse2.MultiplyAddAdjacent(b23, K5352_2217);
Vector128<int> c3 = Sse2.MultiplyAddAdjacent(b23, K2217_5352);
Vector128<int> d1 = Sse2.Add(c1, K12000PlusOne);
Vector128<int> d3 = Sse2.Add(c3, K51000);
Vector128<int> c1 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16()); // K5352_2217
Vector128<int> c3 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16()); // K2217_5352
Vector128<int> d1 = Sse2.Add(c1, Vector128.Create(12000 + (1 << 16))); // K12000PlusOne
Vector128<int> d3 = Sse2.Add(c3, Vector128.Create(51000));
Vector128<int> e1 = Sse2.ShiftRightArithmetic(d1, 16);
Vector128<int> e3 = Sse2.ShiftRightArithmetic(d3, 16);
@ -623,7 +600,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// a0 = v0 + v3
// a1 = v1 + v2
Vector128<short> a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16());
Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), Seven);
Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), Vector128.Create((short)7));
Vector128<short> a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
Vector128<short> c0 = Sse2.Add(a01Plus7, a11);
Vector128<short> c2 = Sse2.Subtract(a01Plus7, a11);

6
src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs

@ -24,10 +24,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
/// </summary>
private const int MaxCoeffThresh = 31;
#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector256<short> MaxCoeffThreshVec = Vector256.Create((short)MaxCoeffThresh);
#endif
private int maxValue;
private int lastNonZero;
@ -73,7 +69,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector256<short> v0 = Avx2.ShiftRightArithmetic(abs0.AsInt16(), 3);
// bin = min(v, MAX_COEFF_THRESH)
Vector256<short> min0 = Avx2.Min(v0, MaxCoeffThreshVec);
Vector256<short> min0 = Avx2.Min(v0, Vector256.Create((short)MaxCoeffThresh));
// Store.
Unsafe.As<short, Vector256<short>>(ref outputRef) = min0;

10
src/ImageSharp/Formats/Webp/Lossy/Vp8Residual.cs

@ -16,12 +16,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
/// </summary>
internal class Vp8Residual
{
#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector256<byte> Cst2 = Vector256.Create((byte)2);
private static readonly Vector256<byte> Cst67 = Vector256.Create((byte)67);
#endif
private readonly byte[] scratch = new byte[32];
private readonly ushort[] scratchUShort = new ushort[16];
@ -182,8 +176,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector256<short> d0 = Avx2.Subtract(Vector256<short>.Zero, c0);
Vector256<short> e0 = Avx2.Max(c0, d0); // abs(v), 16b
Vector256<sbyte> f = Avx2.PackSignedSaturate(e0, e0);
Vector256<byte> g = Avx2.Min(f.AsByte(), Cst2);
Vector256<byte> h = Avx2.Min(f.AsByte(), Cst67); // clampLevel in [0..67]
Vector256<byte> g = Avx2.Min(f.AsByte(), Vector256.Create((byte)2));
Vector256<byte> h = Avx2.Min(f.AsByte(), Vector256.Create((byte)67)); // clampLevel in [0..67]
ref byte ctxsRef = ref MemoryMarshal.GetReference(ctxs);
ref byte levelsRef = ref MemoryMarshal.GetReference(levels);

89
src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs

@ -23,49 +23,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
private const int YuvHalf = 1 << (YuvFix - 1);
#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector128<byte> One = Vector128.Create((byte)1);
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
private static readonly Vector128<byte> K19077 = Vector128.Create((short)19077).AsByte();
private static readonly Vector128<byte> K26149 = Vector128.Create((short)26149).AsByte();
private static readonly Vector128<byte> K14234 = Vector128.Create((short)14234).AsByte();
// 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
private static readonly Vector128<byte> K33050 = Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129);
private static readonly Vector128<byte> K17685 = Vector128.Create((short)17685).AsByte();
private static readonly Vector128<byte> K6419 = Vector128.Create((short)6419).AsByte();
private static readonly Vector128<byte> K13320 = Vector128.Create((short)13320).AsByte();
private static readonly Vector128<byte> K8708 = Vector128.Create((short)8708).AsByte();
private static readonly Vector128<byte> PlanarTo24Shuffle0 = Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5);
private static readonly Vector128<byte> PlanarTo24Shuffle1 = Vector128.Create(255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10, 255);
private static readonly Vector128<byte> PlanarTo24Shuffle2 = Vector128.Create(255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255, 255);
private static readonly Vector128<byte> PlanarTo24Shuffle3 = Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255);
private static readonly Vector128<byte> PlanarTo24Shuffle4 = Vector128.Create(5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10);
private static readonly Vector128<byte> PlanarTo24Shuffle5 = Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255);
private static readonly Vector128<byte> PlanarTo24Shuffle6 = Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255);
private static readonly Vector128<byte> PlanarTo24Shuffle7 = Vector128.Create(255, 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255);
private static readonly Vector128<byte> PlanarTo24Shuffle8 = Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15);
#endif
// UpSample from YUV to RGB.
// Given samples laid out in a square as:
// [a b]
@ -250,7 +207,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<byte> t1 = Sse2.Or(ad, bc); // (a^d) | (b^c)
Vector128<byte> t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t)
Vector128<byte> t3 = Sse2.And(t2, One); // (a^d) | (b^c) | (s^t) & 1
Vector128<byte> t3 = Sse2.And(t2, Vector128.Create((byte)1)); // (a^d) | (b^c) | (s^t) & 1
Vector128<byte> t4 = Sse2.Average(s, t);
Vector128<byte> k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4
@ -289,7 +246,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Vector128<byte> tmp1 = Sse2.And(ij, st); // (ij) & (s^t)
Vector128<byte> tmp2 = Sse2.Xor(k, input); // (k^in)
Vector128<byte> tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in)
Vector128<byte> tmp4 = Sse2.And(tmp3, One); // & 1 -> lsb_correction
Vector128<byte> tmp4 = Sse2.And(tmp3, Vector128.Create((byte)1)); // & 1 -> lsb_correction
return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction
}
@ -668,9 +625,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
ChannelMixing(
input0,
input1,
PlanarTo24Shuffle0,
PlanarTo24Shuffle1,
PlanarTo24Shuffle2,
Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5), // PlanarTo24Shuffle0
Vector128.Create(255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10, 255), // PlanarTo24Shuffle1
Vector128.Create(255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255, 255), // PlanarTo24Shuffle2
out Vector128<byte> r0,
out Vector128<byte> r1,
out Vector128<byte> r2,
@ -683,9 +640,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
ChannelMixing(
input2,
input3,
PlanarTo24Shuffle3,
PlanarTo24Shuffle4,
PlanarTo24Shuffle5,
Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255), // PlanarTo24Shuffle3
Vector128.Create(5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10), // PlanarTo24Shuffle4
Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255), // PlanarTo24Shuffle5
out Vector128<byte> g0,
out Vector128<byte> g1,
out Vector128<byte> g2,
@ -697,9 +654,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
ChannelMixing(
input4,
input5,
PlanarTo24Shuffle6,
PlanarTo24Shuffle7,
PlanarTo24Shuffle8,
Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255), // PlanarTo24Shuffle6
Vector128.Create(255, 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255), // PlanarTo24Shuffle7
Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15), // PlanarTo24Shuffle8
out Vector128<byte> b0,
out Vector128<byte> b1,
out Vector128<byte> b2,
@ -757,21 +714,29 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
u0 = Sse2.UnpackLow(Vector128<byte>.Zero, u0);
v0 = Sse2.UnpackLow(Vector128<byte>.Zero, v0);
Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16());
Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16());
Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16());
Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16());
// These constants are the 14-bit fixed-point versions of the ITU-R BT.601 constants.
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
Vector128<ushort> k19077 = Vector128.Create((ushort)19077);
Vector128<ushort> k26149 = Vector128.Create((ushort)26149);
Vector128<ushort> k14234 = Vector128.Create((ushort)14234);
Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), k19077);
Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), k26149);
Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
Vector128<ushort> r1 = Sse2.Subtract(y1.AsUInt16(), K14234.AsUInt16());
Vector128<ushort> r1 = Sse2.Subtract(y1.AsUInt16(), k14234);
Vector128<ushort> r2 = Sse2.Add(r1, r0);
Vector128<ushort> g2 = Sse2.Add(y1.AsUInt16(), K8708.AsUInt16());
Vector128<ushort> g2 = Sse2.Add(y1.AsUInt16(), Vector128.Create((ushort)8708));
Vector128<ushort> g3 = Sse2.Add(g0, g1);
Vector128<ushort> g4 = Sse2.Subtract(g2, g3);
Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), K33050.AsUInt16());
Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
Vector128<ushort> b1 = Sse2.AddSaturate(b0, y1);
Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, K17685.AsUInt16());
Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, Vector128.Create((ushort)17685));
// Use logical shift for B2, which can be larger than 32767.
r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]

43
src/ImageSharp/Formats/Webp/WebpCommonUtils.cs

@ -16,16 +16,6 @@ namespace SixLabors.ImageSharp.Formats.Webp
/// </summary>
internal static class WebpCommonUtils
{
#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector256<byte> AlphaMaskVector256 = Vector256.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
private static readonly Vector256<byte> All0x80Vector256 = Vector256.Create((byte)0x80).AsByte();
private static readonly Vector128<byte> AlphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
private static readonly Vector128<byte> All0x80 = Vector128.Create((byte)0x80).AsByte();
#endif
/// <summary>
/// Checks if the pixel row is not opaque.
/// </summary>
@ -41,20 +31,23 @@ namespace SixLabors.ImageSharp.Formats.Webp
int length = (row.Length * 4) - 3;
fixed (byte* src = rowBytes)
{
Vector256<byte> alphaMaskVector256 = Vector256.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
Vector256<byte> all0x80Vector256 = Vector256.Create((byte)0x80).AsByte();
for (; i + 128 <= length; i += 128)
{
Vector256<byte> a0 = Avx.LoadVector256(src + i).AsByte();
Vector256<byte> a1 = Avx.LoadVector256(src + i + 32).AsByte();
Vector256<byte> a2 = Avx.LoadVector256(src + i + 64).AsByte();
Vector256<byte> a3 = Avx.LoadVector256(src + i + 96).AsByte();
Vector256<int> b0 = Avx2.And(a0, AlphaMaskVector256).AsInt32();
Vector256<int> b1 = Avx2.And(a1, AlphaMaskVector256).AsInt32();
Vector256<int> b2 = Avx2.And(a2, AlphaMaskVector256).AsInt32();
Vector256<int> b3 = Avx2.And(a3, AlphaMaskVector256).AsInt32();
Vector256<int> b0 = Avx2.And(a0, alphaMaskVector256).AsInt32();
Vector256<int> b1 = Avx2.And(a1, alphaMaskVector256).AsInt32();
Vector256<int> b2 = Avx2.And(a2, alphaMaskVector256).AsInt32();
Vector256<int> b3 = Avx2.And(a3, alphaMaskVector256).AsInt32();
Vector256<short> c0 = Avx2.PackSignedSaturate(b0, b1).AsInt16();
Vector256<short> c1 = Avx2.PackSignedSaturate(b2, b3).AsInt16();
Vector256<byte> d = Avx2.PackSignedSaturate(c0, c1).AsByte();
Vector256<byte> bits = Avx2.CompareEqual(d, All0x80Vector256);
Vector256<byte> bits = Avx2.CompareEqual(d, all0x80Vector256);
int mask = Avx2.MoveMask(bits);
if (mask != -1)
{
@ -137,18 +130,20 @@ namespace SixLabors.ImageSharp.Formats.Webp
#if SUPPORTS_RUNTIME_INTRINSICS
private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i)
{
Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
Vector128<byte> a2 = Sse2.LoadVector128(src + i + 32).AsByte();
Vector128<byte> a3 = Sse2.LoadVector128(src + i + 48).AsByte();
Vector128<int> b0 = Sse2.And(a0, AlphaMask).AsInt32();
Vector128<int> b1 = Sse2.And(a1, AlphaMask).AsInt32();
Vector128<int> b2 = Sse2.And(a2, AlphaMask).AsInt32();
Vector128<int> b3 = Sse2.And(a3, AlphaMask).AsInt32();
Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
Vector128<int> b2 = Sse2.And(a2, alphaMask).AsInt32();
Vector128<int> b3 = Sse2.And(a3, alphaMask).AsInt32();
Vector128<short> c0 = Sse2.PackSignedSaturate(b0, b1).AsInt16();
Vector128<short> c1 = Sse2.PackSignedSaturate(b2, b3).AsInt16();
Vector128<byte> d = Sse2.PackSignedSaturate(c0, c1).AsByte();
Vector128<byte> bits = Sse2.CompareEqual(d, All0x80);
Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
int mask = Sse2.MoveMask(bits);
if (mask != 0xFFFF)
{
@ -160,13 +155,15 @@ namespace SixLabors.ImageSharp.Formats.Webp
private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i)
{
Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);
Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
Vector128<int> b0 = Sse2.And(a0, AlphaMask).AsInt32();
Vector128<int> b1 = Sse2.And(a1, AlphaMask).AsInt32();
Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
Vector128<short> c = Sse2.PackSignedSaturate(b0, b1).AsInt16();
Vector128<byte> d = Sse2.PackSignedSaturate(c, c).AsByte();
Vector128<byte> bits = Sse2.CompareEqual(d, All0x80);
Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
int mask = Sse2.MoveMask(bits);
if (mask != 0xFFFF)
{

Loading…
Cancel
Save