Browse Source

Avoid pinning

pull/1810/head
Brian Popow 4 years ago
parent
commit
6e135cbd79
  1. 219
      src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

219
src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

@ -4,6 +4,7 @@
using System;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@ -614,120 +615,114 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
Span<int> sum = scratch.Slice(0, 4);
sum.Clear();
#pragma warning disable SA1503 // Braces should not be omitted
fixed (byte* inputAPtr = inputA)
fixed (byte* inputBPtr = inputB)
fixed (ushort* wPtr = w)
fixed (int* outputPtr = sum)
{
// Load and combine inputs.
Vector128<byte> ina0 = Sse2.LoadVector128(inputAPtr);
Vector128<byte> ina1 = Sse2.LoadVector128(inputAPtr + (WebpConstants.Bps * 1));
Vector128<byte> ina2 = Sse2.LoadVector128(inputAPtr + (WebpConstants.Bps * 2));
Vector128<long> ina3 = Sse2.LoadVector128((long*)(inputAPtr + (WebpConstants.Bps * 3)));
Vector128<byte> inb0 = Sse2.LoadVector128(inputBPtr);
Vector128<byte> inb1 = Sse2.LoadVector128(inputBPtr + (WebpConstants.Bps * 1));
Vector128<byte> inb2 = Sse2.LoadVector128(inputBPtr + (WebpConstants.Bps * 2));
Vector128<long> inb3 = Sse2.LoadVector128((long*)(inputBPtr + (WebpConstants.Bps * 3)));
// Combine inA and inB (we'll do two transforms in parallel).
Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent transpose.
// Calculate a and b (two 4x4 at once).
Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
Vector128<short> b0 = Sse2.Add(a0, a1);
Vector128<short> b1 = Sse2.Add(a3, a2);
Vector128<short> b2 = Sse2.Subtract(a3, a2);
Vector128<short> b3 = Sse2.Subtract(a0, a1);
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Transpose the two 4x4.
Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 a22 b32 b03 b13 b23 b33
Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
// Horizontal pass and difference of weighted sums.
Vector128<ushort> w0 = Sse2.LoadVector128(wPtr);
Vector128<ushort> w8 = Sse2.LoadVector128(wPtr + 8);
// Calculate a and b (two 4x4 at once).
a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
b0 = Sse2.Add(a0, a1);
b1 = Sse2.Add(a3, a2);
b2 = Sse2.Subtract(a3, a2);
b3 = Sse2.Subtract(a0, a1);
// Separate the transforms of inA and inB.
Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
// weighted sums.
Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
// difference of weighted sums.
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
Sse2.Store(outputPtr, result.AsInt32());
}
// Load and combine inputs.
Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
Vector128<byte> ina2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 2, 16)));
Vector128<long> ina3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
Vector128<byte> inb0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB));
Vector128<byte> inb1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps, 16)));
Vector128<byte> inb2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 2, 16)));
Vector128<long> inb3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
// Combine inA and inB (we'll do two transforms in parallel).
Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent transpose.
// Calculate a and b (two 4x4 at once).
Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
Vector128<short> b0 = Sse2.Add(a0, a1);
Vector128<short> b1 = Sse2.Add(a3, a2);
Vector128<short> b2 = Sse2.Subtract(a3, a2);
Vector128<short> b3 = Sse2.Subtract(a0, a1);
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Transpose the two 4x4.
Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 a22 b32 b03 b13 b23 b33
Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
// Horizontal pass and difference of weighted sums.
Vector128<ushort> w0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w));
Vector128<ushort> w8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w.Slice(8, 8)));
// Calculate a and b (two 4x4 at once).
a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
b0 = Sse2.Add(a0, a1);
b1 = Sse2.Add(a3, a2);
b2 = Sse2.Subtract(a3, a2);
b3 = Sse2.Subtract(a0, a1);
// Separate the transforms of inA and inB.
Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
// weighted sums.
Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
// difference of weighted sums.
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
ref int outputRef = ref MemoryMarshal.GetReference(sum);
Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
return sum[3] + sum[2] + sum[1] + sum[0];
#pragma warning restore SA1503 // Braces should not be omitted
}
#endif

Loading…
Cancel
Save