Browse Source

Add SSE2 version of SimpleHFilter16

pull/1871/head
Brian Popow 5 years ago
parent
commit
7cf715b401
  1. 131
      src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

131
src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

@ -967,13 +967,27 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
/// <summary>
/// Filters the 16 rows that start at <paramref name="offset"/> and are spaced
/// <paramref name="stride"/> bytes apart, working on the bytes around each row position.
/// </summary>
/// <param name="p">The pixel buffer; it is modified in place.</param>
/// <param name="offset">Index into <paramref name="p"/> of the first row position to filter.</param>
/// <param name="stride">Distance in bytes between two consecutive rows.</param>
/// <param name="thresh">The filter threshold.</param>
public static void SimpleHFilter16(Span<byte> p, int offset, int stride, int thresh)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse2.IsSupported)
    {
        // Beginning of p1: step two bytes back from the filter position so that
        // each loaded row holds the four bytes p1, p0, q0, q1.
        p = p.Slice(offset - 2);

        // Rows 0..7 are read starting at p, rows 8..15 starting 8 * stride further on.
        Load16x4Sse2(p, p.Slice(8 * stride), stride, out Vector128<byte> p1, out Vector128<byte> p0, out Vector128<byte> q0, out Vector128<byte> q1);

        // NOTE(review): this path hands over `thresh` unchanged, while the scalar path
        // below tests against (2 * thresh) + 1 - confirm DoFilter2Sse2 expects the raw value.
        DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh);
        Store16x4Sse2(p1, p0, q0, q1, p, p.Slice(8 * stride), stride);
    }
    else
#endif
    {
        int thresh2 = (2 * thresh) + 1;
        int end = offset + (16 * stride);

        // Scalar fallback: visit one row position per iteration and filter only where needed.
        for (int i = offset; i < end; i += stride)
        {
            if (NeedsFilter(p, i, 1, thresh2))
            {
                DoFilter2(p, i, 1);
            }
        }
    }
}
@ -1357,6 +1371,113 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
return Sse2.CompareEqual(t7, Vector128<byte>.Zero);
}
/// <summary>
/// Loads a block of 16 rows by 4 bytes and returns it transposed, one vector per column.
/// </summary>
/// <param name="r0">Span starting at row 0 of the block.</param>
/// <param name="r8">Span starting at row 8 of the block.</param>
/// <param name="stride">Distance in bytes between two consecutive rows.</param>
/// <param name="p1">Receives column 0 of all 16 rows.</param>
/// <param name="p0">Receives column 1 of all 16 rows.</param>
/// <param name="q0">Receives column 2 of all 16 rows.</param>
/// <param name="q1">Receives column 3 of all 16 rows.</param>
private static void Load16x4Sse2(Span<byte> r0, Span<byte> r8, int stride, out Vector128<byte> p1, out Vector128<byte> p0, out Vector128<byte> q0, out Vector128<byte> q1)
{
    // The pixels around the edge (|) are numbered <row><column>:
    // 00 01 | 02 03
    // 10 11 | 12 13
    // ...   | ...
    // e0 e1 | e2 e3
    // f0 f1 | f2 f3
    //
    // Each 8-row load yields two vectors (listed from the highest byte to the lowest):
    // upperLeft  = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    // upperRight = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
    // lowerLeft  = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
    // lowerRight = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
    Load8x4Sse2(r0, stride, out Vector128<byte> upperLeft, out Vector128<byte> upperRight);
    Load8x4Sse2(r8, stride, out Vector128<byte> lowerLeft, out Vector128<byte> lowerRight);

    Vector128<long> upperLeft64 = upperLeft.AsInt64();
    Vector128<long> lowerLeft64 = lowerLeft.AsInt64();
    Vector128<long> upperRight64 = upperRight.AsInt64();
    Vector128<long> lowerRight64 = lowerRight.AsInt64();

    // Combining matching 64-bit halves of the upper and lower loads gives whole columns:
    // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
    p1 = Sse2.UnpackLow(upperLeft64, lowerLeft64).AsByte();
    p0 = Sse2.UnpackHigh(upperLeft64, lowerLeft64).AsByte();
    q0 = Sse2.UnpackLow(upperRight64, lowerRight64).AsByte();
    q1 = Sse2.UnpackHigh(upperRight64, lowerRight64).AsByte();
}
// Reads 8 rows across a vertical edge.
/// <summary>
/// Reads the first 4 bytes of 8 consecutive rows and transposes them into two vectors.
/// </summary>
/// <param name="b">Span starting at the first byte of row 0.</param>
/// <param name="stride">Distance in bytes between two consecutive rows.</param>
/// <param name="p">Receives column 0 of the 8 rows in its low half and column 1 in its high half.</param>
/// <param name="q">Receives column 2 of the 8 rows in its low half and column 3 in its high half.</param>
private static void Load8x4Sse2(Span<byte> b, int stride, out Vector128<byte> p, out Vector128<byte> q)
{
    // Rows are read through raw references, so no bounds checks are performed here.
    ref byte origin = ref MemoryMarshal.GetReference(b);

    // One 32-bit read per row: four horizontally adjacent bytes.
    uint row0 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref origin, 0 * stride));
    uint row1 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref origin, 1 * stride));
    uint row2 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref origin, 2 * stride));
    uint row3 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref origin, 3 * stride));
    uint row4 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref origin, 4 * stride));
    uint row5 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref origin, 5 * stride));
    uint row6 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref origin, 6 * stride));
    uint row7 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref origin, 7 * stride));

    // Vectors are listed from the highest byte to the lowest, digits are <row><column>.
    // evenRows = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
    // oddRows  = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
    Vector128<byte> evenRows = Vector128.Create(row0, row4, row2, row6).AsByte();
    Vector128<byte> oddRows = Vector128.Create(row1, row5, row3, row7).AsByte();

    // Byte interleave:
    // bytesLow  = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
    // bytesHigh = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
    Vector128<short> bytesLow = Sse2.UnpackLow(evenRows, oddRows).AsInt16();
    Vector128<short> bytesHigh = Sse2.UnpackHigh(evenRows, oddRows).AsInt16();

    // 16-bit interleave:
    // wordsLow  = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
    // wordsHigh = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
    Vector128<int> wordsLow = Sse2.UnpackLow(bytesLow, bytesHigh).AsInt32();
    Vector128<int> wordsHigh = Sse2.UnpackHigh(bytesLow, bytesHigh).AsInt32();

    // 32-bit interleave completes the transpose:
    // p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    // q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
    p = Sse2.UnpackLow(wordsLow, wordsHigh).AsByte();
    q = Sse2.UnpackHigh(wordsLow, wordsHigh).AsByte();
}
// Transpose back and store
/// <summary>
/// Transposes four column vectors back into 16 rows of 4 bytes and writes them out.
/// </summary>
/// <param name="p1">Column 0 of the 16 rows.</param>
/// <param name="p0">Column 1 of the 16 rows.</param>
/// <param name="q0">Column 2 of the 16 rows.</param>
/// <param name="q1">Column 3 of the 16 rows.</param>
/// <param name="r0">Destination span starting at row 0.</param>
/// <param name="r8">Destination span starting at row 8.</param>
/// <param name="stride">Distance in bytes between two consecutive rows.</param>
private static void Store16x4Sse2(Vector128<byte> p1, Vector128<byte> p0, Vector128<byte> q0, Vector128<byte> q1, Span<byte> r0, Span<byte> r8, int stride)
{
    // Vectors are listed from the highest byte to the lowest, digits are <row><column>.
    // Byte interleave of columns 0 and 1:
    // leftRows0To7 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
    // leftRows8ToF = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
    Vector128<short> leftRows0To7 = Sse2.UnpackLow(p1, p0).AsInt16();
    Vector128<short> leftRows8ToF = Sse2.UnpackHigh(p1, p0).AsInt16();

    // Byte interleave of columns 2 and 3:
    // rightRows0To7 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
    // rightRows8ToF = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
    Vector128<short> rightRows0To7 = Sse2.UnpackLow(q0, q1).AsInt16();
    Vector128<short> rightRows8ToF = Sse2.UnpackHigh(q0, q1).AsInt16();

    // 16-bit interleave places the four bytes of every row next to each other:
    // rows0To3 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
    // rows4To7 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
    // rows8ToB = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
    // rowsCToF = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
    Vector128<byte> rows0To3 = Sse2.UnpackLow(leftRows0To7, rightRows0To7).AsByte();
    Vector128<byte> rows4To7 = Sse2.UnpackHigh(leftRows0To7, rightRows0To7).AsByte();
    Vector128<byte> rows8ToB = Sse2.UnpackLow(leftRows8ToF, rightRows8ToF).AsByte();
    Vector128<byte> rowsCToF = Sse2.UnpackHigh(leftRows8ToF, rightRows8ToF).AsByte();

    // Each vector carries four rows; write them in ascending row order.
    Store4x4Sse2(rows0To3, r0, stride);
    Store4x4Sse2(rows4To7, r0.Slice(4 * stride), stride);
    Store4x4Sse2(rows8ToB, r8, stride);
    Store4x4Sse2(rowsCToF, r8.Slice(4 * stride), stride);
}
/// <summary>
/// Writes the four 32-bit lanes of a vector to four consecutive rows, lowest lane first.
/// </summary>
/// <param name="x">The vector holding four rows of 4 bytes each.</param>
/// <param name="dst">Destination span starting at the first row to write.</param>
/// <param name="stride">Distance in bytes between two consecutive rows.</param>
private static void Store4x4Sse2(Vector128<byte> x, Span<byte> dst, int stride)
{
    // Writes go through a raw reference, so no bounds checks are performed here.
    ref byte target = ref MemoryMarshal.GetReference(dst);
    Vector128<int> lanes = x.AsInt32();

    for (int row = 0; row < 4; row++)
    {
        // Store the lowest lane, then shift the vector down by 4 bytes so the next lane becomes the lowest.
        Unsafe.As<byte, int>(ref Unsafe.Add(ref target, row * stride)) = Sse2.ConvertToInt32(lanes);
        lanes = Sse2.ShiftRightLogical128BitLane(lanes, 4);
    }
}
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128<sbyte> GetBaseDeltaSse2(Vector128<sbyte> p1, Vector128<sbyte> p0, Vector128<sbyte> q0, Vector128<sbyte> q1)
{

Loading…
Cancel
Save