|
|
|
@ -1258,11 +1258,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
if (Sse2.IsSupported) |
|
|
|
{ |
|
|
|
// Beginning of p1
|
|
|
|
p = p.Slice(offset - 2); |
|
|
|
ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), offset - 2); |
|
|
|
|
|
|
|
Load16x4(p, p.Slice(8 * stride), stride, out Vector128<byte> p1, out Vector128<byte> p0, out Vector128<byte> q0, out Vector128<byte> q1); |
|
|
|
Load16x4(ref pRef, ref Unsafe.Add(ref pRef, 8 * stride), stride, out Vector128<byte> p1, out Vector128<byte> p0, out Vector128<byte> q0, out Vector128<byte> q1); |
|
|
|
DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); |
|
|
|
Store16x4(p1, p0, q0, q1, p, p.Slice(8 * stride), stride); |
|
|
|
Store16x4(p1, p0, q0, q1, ref pRef, ref Unsafe.Add(ref pRef, 8 * stride), stride); |
|
|
|
} |
|
|
|
else |
|
|
|
#endif
|
|
|
|
@ -1374,14 +1374,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
if (Sse2.IsSupported) |
|
|
|
{ |
|
|
|
Span<byte> b = p.Slice(offset - 4); |
|
|
|
Load16x4(b, b.Slice(8 * stride), stride, out Vector128<byte> p3, out Vector128<byte> p2, out Vector128<byte> p1, out Vector128<byte> p0); |
|
|
|
ref byte pRef = ref MemoryMarshal.GetReference(p); |
|
|
|
ref byte bRef = ref Unsafe.Add(ref pRef, offset - 4); |
|
|
|
Load16x4(ref bRef, ref Unsafe.Add(ref bRef, 8 * stride), stride, out Vector128<byte> p3, out Vector128<byte> p2, out Vector128<byte> p1, out Vector128<byte> p0); |
|
|
|
|
|
|
|
Vector128<byte> mask = Abs(p1, p0); |
|
|
|
mask = Sse2.Max(mask, Abs(p3, p2)); |
|
|
|
mask = Sse2.Max(mask, Abs(p2, p1)); |
|
|
|
|
|
|
|
Load16x4(p.Slice(offset), p.Slice(offset + (8 * stride)), stride, out Vector128<byte> q0, out Vector128<byte> q1, out Vector128<byte> q2, out Vector128<byte> q3); |
|
|
|
Load16x4(ref Unsafe.Add(ref pRef, offset), ref Unsafe.Add(ref pRef, offset + (8 * stride)), stride, out Vector128<byte> q0, out Vector128<byte> q1, out Vector128<byte> q2, out Vector128<byte> q3); |
|
|
|
|
|
|
|
mask = Sse2.Max(mask, Abs(q1, q0)); |
|
|
|
mask = Sse2.Max(mask, Abs(q3, q2)); |
|
|
|
@ -1390,8 +1391,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); |
|
|
|
DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); |
|
|
|
|
|
|
|
Store16x4(p3, p2, p1, p0, b, b.Slice(8 * stride), stride); |
|
|
|
Store16x4(q0, q1, q2, q3, p.Slice(offset), p.Slice(offset + (8 * stride)), stride); |
|
|
|
Store16x4(p3, p2, p1, p0, ref bRef, ref Unsafe.Add(ref bRef, 8 * stride), stride); |
|
|
|
Store16x4(q0, q1, q2, q3, ref Unsafe.Add(ref pRef, offset), ref Unsafe.Add(ref pRef, offset + (8 * stride)), stride); |
|
|
|
} |
|
|
|
else |
|
|
|
#endif
|
|
|
|
@ -1463,13 +1464,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
if (Sse2.IsSupported) |
|
|
|
{ |
|
|
|
Load16x4(p.Slice(offset), p.Slice(offset + (8 * stride)), stride, out Vector128<byte> p3, out Vector128<byte> p2, out Vector128<byte> p1, out Vector128<byte> p0); |
|
|
|
ref byte pRef = ref MemoryMarshal.GetReference(p); |
|
|
|
Load16x4(ref Unsafe.Add(ref pRef, offset), ref Unsafe.Add(ref pRef, offset + (8 * stride)), stride, out Vector128<byte> p3, out Vector128<byte> p2, out Vector128<byte> p1, out Vector128<byte> p0); |
|
|
|
|
|
|
|
Vector128<byte> mask; |
|
|
|
for (int k = 3; k > 0; k--) |
|
|
|
{ |
|
|
|
// Beginning of p1.
|
|
|
|
Span<byte> b = p.Slice(offset + 2); |
|
|
|
ref byte bRef = ref Unsafe.Add(ref pRef, offset + 2); |
|
|
|
|
|
|
|
// Beginning of q0 (and next span).
|
|
|
|
offset += 4; |
|
|
|
@ -1479,7 +1481,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
mask = Sse2.Max(mask, Abs(p3, p2)); |
|
|
|
mask = Sse2.Max(mask, Abs(p2, p1)); |
|
|
|
|
|
|
|
Load16x4(p.Slice(offset), p.Slice(offset + (8 * stride)), stride, out p3, out p2, out Vector128<byte> tmp1, out Vector128<byte> tmp2); |
|
|
|
Load16x4(ref Unsafe.Add(ref pRef, offset), ref Unsafe.Add(ref pRef, offset + (8 * stride)), stride, out p3, out p2, out Vector128<byte> tmp1, out Vector128<byte> tmp2); |
|
|
|
|
|
|
|
mask = Sse2.Max(mask, Abs(tmp1, tmp2)); |
|
|
|
mask = Sse2.Max(mask, Abs(p3, p2)); |
|
|
|
@ -1488,7 +1490,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
ComplexMask(p1, p0, p3, p2, thresh, ithresh, ref mask); |
|
|
|
DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); |
|
|
|
|
|
|
|
Store16x4(p1, p0, p3, p2, b, b.Slice(8 * stride), stride); |
|
|
|
Store16x4(p1, p0, p3, p2, ref bRef, ref Unsafe.Add(ref bRef, 8 * stride), stride); |
|
|
|
|
|
|
|
// Rotate samples.
|
|
|
|
p1 = tmp1; |
|
|
|
@ -1559,15 +1561,15 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
if (Sse2.IsSupported) |
|
|
|
{ |
|
|
|
Span<byte> tu = u.Slice(offset - 4); |
|
|
|
Span<byte> tv = v.Slice(offset - 4); |
|
|
|
Load16x4(tu, tv, stride, out Vector128<byte> p3, out Vector128<byte> p2, out Vector128<byte> p1, out Vector128<byte> p0); |
|
|
|
ref byte uRef = ref MemoryMarshal.GetReference(u); |
|
|
|
ref byte vRef = ref MemoryMarshal.GetReference(v); |
|
|
|
Load16x4(ref Unsafe.Add(ref uRef, offset - 4), ref Unsafe.Add(ref vRef, offset - 4), stride, out Vector128<byte> p3, out Vector128<byte> p2, out Vector128<byte> p1, out Vector128<byte> p0); |
|
|
|
|
|
|
|
Vector128<byte> mask = Abs(p1, p0); |
|
|
|
mask = Sse2.Max(mask, Abs(p3, p2)); |
|
|
|
mask = Sse2.Max(mask, Abs(p2, p1)); |
|
|
|
|
|
|
|
Load16x4(u.Slice(offset), v.Slice(offset), stride, out Vector128<byte> q0, out Vector128<byte> q1, out Vector128<byte> q2, out Vector128<byte> q3); |
|
|
|
Load16x4(ref Unsafe.Add(ref uRef, offset), ref Unsafe.Add(ref vRef, offset), stride, out Vector128<byte> q0, out Vector128<byte> q1, out Vector128<byte> q2, out Vector128<byte> q3); |
|
|
|
|
|
|
|
mask = Sse2.Max(mask, Abs(q1, q0)); |
|
|
|
mask = Sse2.Max(mask, Abs(q3, q2)); |
|
|
|
@ -1576,8 +1578,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); |
|
|
|
DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); |
|
|
|
|
|
|
|
Store16x4(p3, p2, p1, p0, tu, tv, stride); |
|
|
|
Store16x4(q0, q1, q2, q3, u.Slice(offset), v.Slice(offset), stride); |
|
|
|
Store16x4(p3, p2, p1, p0, ref Unsafe.Add(ref uRef, offset - 4), ref Unsafe.Add(ref vRef, offset - 4), stride); |
|
|
|
Store16x4(q0, q1, q2, q3, ref Unsafe.Add(ref uRef, offset), ref Unsafe.Add(ref vRef, offset), stride); |
|
|
|
} |
|
|
|
else |
|
|
|
#endif
|
|
|
|
@ -1640,7 +1642,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
if (Sse2.IsSupported) |
|
|
|
{ |
|
|
|
Load16x4(u.Slice(offset), v.Slice(offset), stride, out Vector128<byte> t2, out Vector128<byte> t1, out Vector128<byte> p1, out Vector128<byte> p0); |
|
|
|
ref byte uRef = ref MemoryMarshal.GetReference(u); |
|
|
|
ref byte vRef = ref MemoryMarshal.GetReference(v); |
|
|
|
Load16x4(ref Unsafe.Add(ref uRef, offset), ref Unsafe.Add(ref vRef, offset), stride, out Vector128<byte> t2, out Vector128<byte> t1, out Vector128<byte> p1, out Vector128<byte> p0); |
|
|
|
|
|
|
|
Vector128<byte> mask = Abs(p1, p0); |
|
|
|
mask = Sse2.Max(mask, Abs(t2, t1)); |
|
|
|
@ -1649,7 +1653,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
// Beginning of q0.
|
|
|
|
offset += 4; |
|
|
|
|
|
|
|
Load16x4(u.Slice(offset), v.Slice(offset), stride, out Vector128<byte> q0, out Vector128<byte> q1, out t1, out t2); |
|
|
|
Load16x4(ref Unsafe.Add(ref uRef, offset), ref Unsafe.Add(ref vRef, offset), stride, out Vector128<byte> q0, out Vector128<byte> q1, out t1, out t2); |
|
|
|
|
|
|
|
mask = Sse2.Max(mask, Abs(q1, q0)); |
|
|
|
mask = Sse2.Max(mask, Abs(t2, t1)); |
|
|
|
@ -1660,7 +1664,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
|
|
|
|
// Beginning of p1.
|
|
|
|
offset -= 2; |
|
|
|
Store16x4(p1, p0, q0, q1, u.Slice(offset), v.Slice(offset), stride); |
|
|
|
Store16x4(p1, p0, q0, q1, ref Unsafe.Add(ref uRef, offset), ref Unsafe.Add(ref vRef, offset), stride); |
|
|
|
} |
|
|
|
else |
|
|
|
#endif
|
|
|
|
@ -2072,7 +2076,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
return Sse2.CompareEqual(t7, Vector128<byte>.Zero); |
|
|
|
} |
|
|
|
|
|
|
|
private static void Load16x4(Span<byte> r0, Span<byte> r8, int stride, out Vector128<byte> p1, out Vector128<byte> p0, out Vector128<byte> q0, out Vector128<byte> q1) |
|
|
|
private static void Load16x4(ref byte r0, ref byte r8, int stride, out Vector128<byte> p1, out Vector128<byte> p0, out Vector128<byte> q0, out Vector128<byte> q1) |
|
|
|
{ |
|
|
|
// Assume the pixels around the edge (|) are numbered as follows
|
|
|
|
// 00 01 | 02 03
|
|
|
|
@ -2089,8 +2093,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
// q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
|
|
|
|
// p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
|
|
|
|
// q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
|
|
|
|
Load8x4(r0, stride, out Vector128<byte> t1, out Vector128<byte> t2); |
|
|
|
Load8x4(r8, stride, out p0, out q1); |
|
|
|
Load8x4(ref r0, stride, out Vector128<byte> t1, out Vector128<byte> t2); |
|
|
|
Load8x4(ref r8, stride, out p0, out q1); |
|
|
|
|
|
|
|
// p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
|
|
|
|
// p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
|
|
|
|
@ -2103,11 +2107,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
} |
|
|
|
|
|
|
|
// Reads 8 rows across a vertical edge.
|
|
|
|
private static void Load8x4(Span<byte> b, int stride, out Vector128<byte> p, out Vector128<byte> q) |
|
|
|
private static void Load8x4(ref byte bRef, int stride, out Vector128<byte> p, out Vector128<byte> q) |
|
|
|
{ |
|
|
|
// A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
|
|
|
|
// A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
|
|
|
|
ref byte bRef = ref MemoryMarshal.GetReference(b); |
|
|
|
uint a00 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref bRef, 6 * stride)); |
|
|
|
uint a01 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref bRef, 2 * stride)); |
|
|
|
uint a02 = Unsafe.As<byte, uint>(ref Unsafe.Add(ref bRef, 4 * stride)); |
|
|
|
@ -2136,7 +2139,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
} |
|
|
|
|
|
|
|
// Transpose back and store
|
|
|
|
private static void Store16x4(Vector128<byte> p1, Vector128<byte> p0, Vector128<byte> q0, Vector128<byte> q1, Span<byte> r0, Span<byte> r8, int stride) |
|
|
|
private static void Store16x4(Vector128<byte> p1, Vector128<byte> p0, Vector128<byte> q0, Vector128<byte> q1, ref byte r0Ref, ref byte r8Ref, int stride) |
|
|
|
{ |
|
|
|
// p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
|
|
|
|
// p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
|
|
|
|
@ -2160,17 +2163,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
p1s = Sse2.UnpackLow(t1.AsInt16(), q1s.AsInt16()).AsByte(); |
|
|
|
q1s = Sse2.UnpackHigh(t1.AsInt16(), q1s.AsInt16()).AsByte(); |
|
|
|
|
|
|
|
Store4x4(p0s, r0, stride); |
|
|
|
Store4x4(q0s, r0.Slice(4 * stride), stride); |
|
|
|
Store4x4(p0s, ref r0Ref, stride); |
|
|
|
Store4x4(q0s, ref Unsafe.Add(ref r0Ref, 4 * stride), stride); |
|
|
|
|
|
|
|
Store4x4(p1s, r8, stride); |
|
|
|
Store4x4(q1s, r8.Slice(4 * stride), stride); |
|
|
|
Store4x4(p1s, ref r8Ref, stride); |
|
|
|
Store4x4(q1s, ref Unsafe.Add(ref r8Ref, 4 * stride), stride); |
|
|
|
} |
|
|
|
|
|
|
|
private static void Store4x4(Vector128<byte> x, Span<byte> dst, int stride) |
|
|
|
private static void Store4x4(Vector128<byte> x, ref byte dstRef, int stride) |
|
|
|
{ |
|
|
|
int offset = 0; |
|
|
|
ref byte dstRef = ref MemoryMarshal.GetReference(dst); |
|
|
|
for (int i = 0; i < 4; i++) |
|
|
|
{ |
|
|
|
Unsafe.As<byte, int>(ref Unsafe.Add(ref dstRef, offset)) = Sse2.ConvertToInt32(x.AsInt32()); |
|
|
|
|