Browse Source

Add sse2 version of Vp8Sse4X4

pull/1817/head
Brian Popow 5 years ago
parent
commit
8d19c2881d
  1. 59
      src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

59
src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

@ -4,11 +4,16 @@
using System; using System;
using System.Buffers.Binary; using System.Buffers.Binary;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
// ReSharper disable InconsistentNaming // ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Formats.Webp.Lossy namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{ {
internal static class LossyUtils internal static unsafe class LossyUtils
{ {
/// <summary>
/// Forwards to <c>GetSse</c> for a block that is 16 wide and 16 high.
/// </summary>
/// <param name="a">The first block of bytes.</param>
/// <param name="b">The second block of bytes.</param>
/// <returns>The value returned by <c>GetSse(a, b, 16, 16)</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b)
{
    const int blockWidth = 16;
    const int blockHeight = 16;
    return GetSse(a, b, blockWidth, blockHeight);
}
@ -17,7 +22,57 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public static int Vp8Sse16X8(Span<byte> a, Span<byte> b)
{
    // Forward to the general routine with a block that is 16 wide and 8 high.
    const int blockWidth = 16;
    const int blockHeight = 8;
    return GetSse(a, b, blockWidth, blockHeight);
}
/// <summary>
/// Computes the sum of squared differences between two 4x4 blocks of bytes.
/// </summary>
/// <remarks>
/// When SSE2 is available, the four rows of each block are read at offsets of 0, 1, 2 and 3 times
/// <c>WebpConstants.Bps</c> bytes from the start of the span, 4 bytes per row.
/// Otherwise the work is delegated to <c>GetSse(a, b, 4, 4)</c>, which is presumably the
/// scalar equivalent (its body is not shown here - confirm).
/// </remarks>
/// <param name="a">
/// The first block. On the SSE2 path it is accessed through an unchecked pointer, so it must hold
/// at least <c>(3 * WebpConstants.Bps) + 4</c> bytes.
/// </param>
/// <param name="b">The second block; same length requirement as <paramref name="a"/>.</param>
/// <returns>The sum over the 16 samples of <c>(a - b) * (a - b)</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse4X4(Span<byte> a, Span<byte> b)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse2.IsSupported)
    {
#pragma warning disable SA1503 // Braces should not be omitted
        Span<int> tmp = stackalloc int[4];
        fixed (byte* aPtr = a)
        fixed (byte* bPtr = b)
        fixed (int* tmpPtr = tmp)
        {
            // Load values. Only the first 4 bytes of each row contribute to the result, so load
            // exactly one 32 bit lane per row (upper lanes are zeroed) instead of 16 bytes.
            // This avoids touching memory past the end of the 4 pixel wide row.
            Vector128<int> a0 = Sse2.LoadScalarVector128((int*)aPtr);
            Vector128<int> a1 = Sse2.LoadScalarVector128((int*)(aPtr + WebpConstants.Bps));
            Vector128<int> a2 = Sse2.LoadScalarVector128((int*)(aPtr + (WebpConstants.Bps * 2)));
            Vector128<int> a3 = Sse2.LoadScalarVector128((int*)(aPtr + (WebpConstants.Bps * 3)));
            Vector128<int> b0 = Sse2.LoadScalarVector128((int*)bPtr);
            Vector128<int> b1 = Sse2.LoadScalarVector128((int*)(bPtr + WebpConstants.Bps));
            Vector128<int> b2 = Sse2.LoadScalarVector128((int*)(bPtr + (WebpConstants.Bps * 2)));
            Vector128<int> b3 = Sse2.LoadScalarVector128((int*)(bPtr + (WebpConstants.Bps * 3)));

            // Combine pair of lines: the low 8 bytes now hold row n followed by row n + 1.
            Vector128<int> a01 = Sse2.UnpackLow(a0, a1);
            Vector128<int> a23 = Sse2.UnpackLow(a2, a3);
            Vector128<int> b01 = Sse2.UnpackLow(b0, b1);
            Vector128<int> b23 = Sse2.UnpackLow(b2, b3);

            // Convert to 16b by interleaving with zero bytes.
            Vector128<short> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero).AsInt16();
            Vector128<short> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero).AsInt16();
            Vector128<short> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero).AsInt16();
            Vector128<short> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero).AsInt16();

            // Subtract, square and accumulate.
            // The subtraction has to be the signed 16 bit one: an unsigned 8 bit saturating
            // subtract clamps every sample where a < b to zero and drops it from the sum.
            Vector128<short> d0 = Sse2.SubtractSaturate(a01s, b01s);
            Vector128<short> d1 = Sse2.SubtractSaturate(a23s, b23s);
            Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
            Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
            Vector128<int> sum = Sse2.Add(e0, e1);

            // Horizontal add of the four 32 bit partial sums.
            Sse2.Store(tmpPtr, sum);
            return tmp[3] + tmp[2] + tmp[1] + tmp[0];
        }
#pragma warning restore SA1503 // Braces should not be omitted
    }
    else
#endif
    {
        return GetSse(a, b, 4, 4);
    }
}
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public static int GetSse(Span<byte> a, Span<byte> b, int w, int h) public static int GetSse(Span<byte> a, Span<byte> b, int w, int h)

Loading…
Cancel
Save