|
|
|
@ -43,6 +43,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b) |
|
|
|
{ |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
if (Avx2.IsSupported) |
|
|
|
{ |
|
|
|
return Vp8_Sse16xN_Avx2(a, b, 4); |
|
|
|
} |
|
|
|
|
|
|
|
if (Sse2.IsSupported) |
|
|
|
{ |
|
|
|
return Vp8_Sse16xN_Sse2(a, b, 8); |
|
|
|
@ -58,6 +63,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b) |
|
|
|
{ |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
if (Avx2.IsSupported) |
|
|
|
{ |
|
|
|
return Vp8_Sse16xN_Avx2(a, b, 2); |
|
|
|
} |
|
|
|
|
|
|
|
if (Sse2.IsSupported) |
|
|
|
{ |
|
|
|
return Vp8_Sse16xN_Sse2(a, b, 4); |
|
|
|
@ -194,6 +204,39 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
return Numerics.ReduceSum(sum); |
|
|
|
} |
|
|
|
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs) |
|
|
|
{ |
|
|
|
Vector256<int> sum = Vector256<int>.Zero; |
|
|
|
int offset = 0; |
|
|
|
for (int i = 0; i < numPairs; i++) |
|
|
|
{ |
|
|
|
// Load values.
|
|
|
|
ref byte aRef = ref MemoryMarshal.GetReference(a); |
|
|
|
ref byte bRef = ref MemoryMarshal.GetReference(b); |
|
|
|
var a0 = Vector256.Create( |
|
|
|
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset)), |
|
|
|
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps))); |
|
|
|
var b0 = Vector256.Create( |
|
|
|
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset)), |
|
|
|
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps))); |
|
|
|
var a1 = Vector256.Create( |
|
|
|
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset + (2 * WebpConstants.Bps))), |
|
|
|
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset + (3 * WebpConstants.Bps)))); |
|
|
|
var b1 = Vector256.Create( |
|
|
|
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + (2 * WebpConstants.Bps))), |
|
|
|
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + (3 * WebpConstants.Bps)))); |
|
|
|
|
|
|
|
Vector256<int> sum1 = SubtractAndAccumulate(a0, b0); |
|
|
|
Vector256<int> sum2 = SubtractAndAccumulate(a1, b1); |
|
|
|
sum = Avx2.Add(sum, Avx2.Add(sum1, sum2)); |
|
|
|
|
|
|
|
offset += 4 * WebpConstants.Bps; |
|
|
|
} |
|
|
|
|
|
|
|
return Numerics.ReduceSum(sum); |
|
|
|
} |
|
|
|
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b) |
|
|
|
{ |
|
|
|
@ -212,6 +255,25 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
|
|
|
|
return Sse2.Add(sum1, sum2); |
|
|
|
} |
|
|
|
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
private static Vector256<int> SubtractAndAccumulate(Vector256<byte> a, Vector256<byte> b) |
|
|
|
{ |
|
|
|
// Take abs(a-b) in 8b.
|
|
|
|
Vector256<byte> ab = Avx2.SubtractSaturate(a, b); |
|
|
|
Vector256<byte> ba = Avx2.SubtractSaturate(b, a); |
|
|
|
Vector256<byte> absAb = Avx2.Or(ab, ba); |
|
|
|
|
|
|
|
// Zero-extend to 16b.
|
|
|
|
Vector256<byte> c0 = Avx2.UnpackLow(absAb, Vector256<byte>.Zero); |
|
|
|
Vector256<byte> c1 = Avx2.UnpackHigh(absAb, Vector256<byte>.Zero); |
|
|
|
|
|
|
|
// Multiply with self.
|
|
|
|
Vector256<int> sum1 = Avx2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); |
|
|
|
Vector256<int> sum2 = Avx2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); |
|
|
|
|
|
|
|
return Avx2.Add(sum1, sum2); |
|
|
|
} |
|
|
|
#endif
|
|
|
|
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
|