📷 A modern, cross-platform, 2D Graphics library for .NET
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

1334 lines
54 KiB

// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
internal static class LossyUtils
{
#if SUPPORTS_RUNTIME_INTRINSICS
// The 16-bit value 0x00FF repeated across the whole vector, reinterpreted as bytes.
// AND-ing byte data with it keeps the low byte of every 16-bit lane and zeroes the high byte.
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
#endif
/// <summary>
/// Sum of squared differences between two 16 pixel wide, 16 pixel high blocks.
/// Note: the equivalent method in the libwebp reference implementation is called VP8SSE16x16.
/// </summary>
/// <param name="a">First block.</param>
/// <param name="b">Second block.</param>
/// <returns>The value returned by <see cref="Vp8_SseNxN"/> for a 16x16 region.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b)
{
    const int blockWidth = 16;
    const int blockHeight = 16;
    return Vp8_SseNxN(a, b, blockWidth, blockHeight);
}
/// <summary>
/// Sum of squared differences between two 16 pixel wide, 8 pixel high blocks.
/// Note: the equivalent method in the libwebp reference implementation is called VP8SSE16x8.
/// </summary>
/// <param name="a">First block.</param>
/// <param name="b">Second block.</param>
/// <returns>The value returned by <see cref="Vp8_SseNxN"/> for a 16x8 region.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b)
{
    const int blockWidth = 16;
    const int blockHeight = 8;
    return Vp8_SseNxN(a, b, blockWidth, blockHeight);
}
/// <summary>
/// Sum of squared differences between two 4x4 pixel blocks whose rows are
/// <see cref="WebpConstants.Bps"/> bytes apart.
/// Note: the equivalent method in the libwebp reference implementation is called VP8SSE4x4.
/// </summary>
/// <param name="a">
/// First block. The SIMD paths load 16 bytes from the start of each of the four rows without
/// bounds checks, so that many bytes must be readable even though only 4 per row are used.
/// </param>
/// <param name="b">Second block; same layout and size requirement as <paramref name="a"/>.</param>
/// <returns>The sum over the 16 pixels of (a - b) squared.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx2.IsSupported)
    {
        // Load values: rows 0/1 and rows 2/3 are packed into the two 128-bit lanes.
        ref byte aRef = ref MemoryMarshal.GetReference(a);
        ref byte bRef = ref MemoryMarshal.GetReference(b);
        var a0 = Vector256.Create(
            Unsafe.As<byte, Vector128<byte>>(ref aRef),
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps)));
        var a1 = Vector256.Create(
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2)),
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3)));
        var b0 = Vector256.Create(
            Unsafe.As<byte, Vector128<byte>>(ref bRef),
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps)));
        var b1 = Vector256.Create(
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2)),
            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3)));

        // Combine pair of lines.
        Vector256<int> a01 = Avx2.UnpackLow(a0.AsInt32(), a1.AsInt32());
        Vector256<int> b01 = Avx2.UnpackLow(b0.AsInt32(), b1.AsInt32());

        // Convert to 16b (zero extension: every pixel becomes a non-negative 16-bit lane).
        Vector256<short> a01s = Avx2.UnpackLow(a01.AsByte(), Vector256<byte>.Zero).AsInt16();
        Vector256<short> b01s = Avx2.UnpackLow(b01.AsByte(), Vector256<byte>.Zero).AsInt16();

        // Subtract, square and accumulate.
        // The subtraction must operate on signed 16-bit lanes: the unsigned byte overload
        // clamps every a < b difference to zero and silently drops that pixel's error.
        Vector256<short> d0 = Avx2.SubtractSaturate(a01s, b01s);
        Vector256<int> e0 = Avx2.MultiplyAddAdjacent(d0, d0);
        return Numerics.ReduceSum(e0);
    }

    if (Sse2.IsSupported)
    {
        // Load values.
        ref byte aRef = ref MemoryMarshal.GetReference(a);
        ref byte bRef = ref MemoryMarshal.GetReference(b);
        Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref aRef);
        Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps));
        Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2));
        Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3));
        Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref bRef);
        Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps));
        Vector128<byte> b2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2));
        Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3));

        // Combine pair of lines.
        Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
        Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
        Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
        Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());

        // Convert to 16b (zero extension: every pixel becomes a non-negative 16-bit lane).
        Vector128<short> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero).AsInt16();

        // Subtract, square and accumulate.
        // Signed 16-bit subtraction keeps negative differences; the unsigned byte overload
        // would clamp them to zero and under-report the error.
        Vector128<short> d0 = Sse2.SubtractSaturate(a01s, b01s);
        Vector128<short> d1 = Sse2.SubtractSaturate(a23s, b23s);
        Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
        Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
        Vector128<int> sum = Sse2.Add(e0, e1);
        return Numerics.ReduceSum(sum);
    }
#endif
    {
        // Scalar fallback.
        return Vp8_SseNxN(a, b, 4, 4);
    }
}
/// <summary>
/// Scalar sum of squared differences over a region of <paramref name="w"/> by <paramref name="h"/> pixels.
/// Consecutive rows of both inputs start <see cref="WebpConstants.Bps"/> bytes apart.
/// </summary>
/// <param name="a">First block.</param>
/// <param name="b">Second block.</param>
/// <param name="w">Number of pixels compared per row.</param>
/// <param name="h">Number of rows compared.</param>
/// <returns>The accumulated squared differences.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8_SseNxN(Span<byte> a, Span<byte> b, int w, int h)
{
    int squaredError = 0;
    for (int row = 0, rowStart = 0; row < h; row++, rowStart += WebpConstants.Bps)
    {
        for (int col = 0; col < w; col++)
        {
            int index = rowStart + col;
            int delta = a[index] - b[index];
            squaredError += delta * delta;
        }
    }

    return squaredError;
}
/// <summary>
/// Copies a 4 byte wide, 4 row high region from <paramref name="src"/> to <paramref name="dst"/> via <see cref="Copy"/>.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Vp8Copy4X4(Span<byte> src, Span<byte> dst)
{
    const int width = 4;
    const int height = 4;
    Copy(src, dst, width, height);
}
/// <summary>
/// Copies a 16 byte wide, 8 row high region from <paramref name="src"/> to <paramref name="dst"/> via <see cref="Copy"/>.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Vp8Copy16X8(Span<byte> src, Span<byte> dst)
{
    const int width = 16;
    const int height = 8;
    Copy(src, dst, width, height);
}
/// <summary>
/// Copies <paramref name="h"/> rows of <paramref name="w"/> bytes from <paramref name="src"/> to
/// <paramref name="dst"/>. In both spans consecutive rows start <see cref="WebpConstants.Bps"/> bytes apart.
/// </summary>
/// <param name="src">Source buffer.</param>
/// <param name="dst">Destination buffer.</param>
/// <param name="w">Bytes copied per row.</param>
/// <param name="h">Number of rows copied.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Copy(Span<byte> src, Span<byte> dst, int w, int h)
{
    for (int row = 0, rowStart = 0; row < h; row++, rowStart += WebpConstants.Bps)
    {
        Span<byte> from = src.Slice(rowStart, w);
        Span<byte> to = dst.Slice(rowStart, w);
        from.CopyTo(to);
    }
}
/// <summary>
/// Accumulates <see cref="Vp8Disto4X4"/> over the sixteen 4x4 sub-blocks of a 16x16 block.
/// </summary>
/// <param name="a">First 16x16 block; rows are <see cref="WebpConstants.Bps"/> bytes apart.</param>
/// <param name="b">Second 16x16 block, same layout.</param>
/// <param name="w">Weights forwarded unchanged to <see cref="Vp8Disto4X4"/>.</param>
/// <param name="scratch">Scratch buffer forwarded unchanged to <see cref="Vp8Disto4X4"/>.</param>
/// <returns>The sum of the sixteen sub-block results.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
{
    // Length of the window handed to each 4x4 call.
    int windowLength = (4 * WebpConstants.Bps) - 16;
    int bandStep = 4 * WebpConstants.Bps;
    int bandEnd = 16 * WebpConstants.Bps;

    int total = 0;
    for (int bandStart = 0; bandStart < bandEnd; bandStart += bandStep)
    {
        for (int col = 0; col < 16; col += 4)
        {
            int start = col + bandStart;
            Span<byte> blockA = a.Slice(start, windowLength);
            Span<byte> blockB = b.Slice(start, windowLength);
            total += Vp8Disto4X4(blockA, blockB, w, scratch);
        }
    }

    return total;
}
/// <summary>
/// Absolute difference of the weighted Hadamard sums of two 4x4 blocks, shifted right by 5.
/// </summary>
/// <param name="a">First block.</param>
/// <param name="b">Second block.</param>
/// <param name="w">Weights passed to the transform.</param>
/// <param name="scratch">Scratch buffer; only used by the scalar path.</param>
/// <returns>The distortion value.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse41.IsSupported)
    {
        // The SIMD transform handles both blocks at once and already returns the difference.
        return Math.Abs(TTransformSse41(a, b, w)) >> 5;
    }
#endif

    int weightedA = TTransform(a, w, scratch);
    int weightedB = TTransform(b, w, scratch);
    return Math.Abs(weightedB - weightedA) >> 5;
}
/// <summary>
/// 16x16 DC prediction: sums the 16 samples above and the 16 samples to the left of
/// <paramref name="offset"/> in <paramref name="yuv"/>, adds 16, shifts right by 5 and
/// passes the result to Put16 together with <paramref name="dst"/>.
/// </summary>
public static void DC16(Span<byte> dst, Span<byte> yuv, int offset)
{
    int leftIdx = offset - 1;
    int topIdx = offset - WebpConstants.Bps;

    // Start at 16 so that the final shift rounds.
    int sum = 16;
    for (int k = 0; k < 16; k++, leftIdx += WebpConstants.Bps)
    {
        // DC += dst[-1 + j * BPS] + dst[j - BPS];
        sum += yuv[leftIdx] + yuv[topIdx + k];
    }

    Put16(sum >> 5, dst);
}
/// <summary>
/// Forwards to TrueMotion with 16 as the last argument.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void TM16(Span<byte> dst, Span<byte> yuv, int offset)
{
    TrueMotion(dst, yuv, offset, 16);
}
/// <summary>
/// 16x16 vertical prediction: copies the 16 samples located one row above
/// <paramref name="offset"/> in <paramref name="yuv"/> into each of the 16 rows of <paramref name="dst"/>.
/// </summary>
public static void VE16(Span<byte> dst, Span<byte> yuv, int offset)
{
    Span<byte> topRow = yuv.Slice(offset - WebpConstants.Bps, 16);
    for (int row = 0, rowStart = 0; row < 16; row++, rowStart += WebpConstants.Bps)
    {
        // memcpy(dst + j * BPS, dst - BPS, 16);
        topRow.CopyTo(dst.Slice(rowStart));
    }
}
/// <summary>
/// 16x16 horizontal prediction: for each of 16 rows, passes the sample to the left of the row
/// start in <paramref name="yuv"/> to Memset with a count of 16, then advances
/// <paramref name="dst"/> by one row.
/// </summary>
public static void HE16(Span<byte> dst, Span<byte> yuv, int offset)
{
    int leftIdx = offset - 1;
    for (int row = 0; row < 16; row++)
    {
        // memset(dst, dst[-1], 16);
        Memset(dst, yuv[leftIdx], 0, 16);
        leftIdx += WebpConstants.Bps;
        dst = dst.Slice(WebpConstants.Bps);
    }
}
/// <summary>
/// 16x16 DC prediction when the top samples are not available: sums the 16 left samples,
/// adds 8, shifts right by 4 and passes the result to Put16.
/// </summary>
public static void DC16NoTop(Span<byte> dst, Span<byte> yuv, int offset)
{
    int sum = 8;
    int leftIdx = offset - 1;
    for (int row = 0; row < 16; row++, leftIdx += WebpConstants.Bps)
    {
        // DC += dst[-1 + j * BPS];
        sum += yuv[leftIdx];
    }

    Put16(sum >> 4, dst);
}
/// <summary>
/// 16x16 DC prediction when the left samples are not available: sums the 16 top samples,
/// adds 8, shifts right by 4 and passes the result to Put16.
/// </summary>
public static void DC16NoLeft(Span<byte> dst, Span<byte> yuv, int offset)
{
    int sum = 8;
    int topIdx = offset - WebpConstants.Bps;
    int topEnd = topIdx + 16;
    for (; topIdx < topEnd; topIdx++)
    {
        // DC += dst[i - BPS];
        sum += yuv[topIdx];
    }

    Put16(sum >> 4, dst);
}
/// <summary>
/// 16x16 DC prediction with neither top nor left samples: passes the constant 0x80 to Put16.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void DC16NoTopLeft(Span<byte> dst)
{
    Put16(0x80, dst);
}
/// <summary>
/// 8x8 chroma DC prediction: sums the 8 top and 8 left samples around <paramref name="offset"/>,
/// adds 8, shifts right by 4 and passes the result to Put8x8uv.
/// </summary>
public static void DC8uv(Span<byte> dst, Span<byte> yuv, int offset)
{
    int topIdx = offset - WebpConstants.Bps;
    int leftIdx = offset - 1;

    int sum = 8;
    for (int k = 0; k < 8; k++, leftIdx += WebpConstants.Bps)
    {
        // dc0 += dst[i - BPS] + dst[-1 + i * BPS];
        sum += yuv[topIdx + k] + yuv[leftIdx];
    }

    Put8x8uv((byte)(sum >> 4), dst);
}
/// <summary>
/// Forwards to TrueMotion with 8 as the last argument.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void TM8uv(Span<byte> dst, Span<byte> yuv, int offset)
{
    TrueMotion(dst, yuv, offset, 8);
}
/// <summary>
/// 8x8 chroma vertical prediction: copies the 8 samples located one row above
/// <paramref name="offset"/> in <paramref name="yuv"/> into each of the 8 rows of <paramref name="dst"/>.
/// </summary>
public static void VE8uv(Span<byte> dst, Span<byte> yuv, int offset)
{
    Span<byte> topRow = yuv.Slice(offset - WebpConstants.Bps, 8);
    for (int row = 0; row < 8; row++)
    {
        // memcpy(dst + j * BPS, dst - BPS, 8);
        topRow.CopyTo(dst.Slice(row * WebpConstants.Bps));
    }
}
/// <summary>
/// 8x8 chroma horizontal prediction: for each of 8 rows, passes the sample to the left of the
/// row start in <paramref name="yuv"/> to Memset with a count of 8, then advances
/// <paramref name="dst"/> by one row.
/// </summary>
public static void HE8uv(Span<byte> dst, Span<byte> yuv, int offset)
{
    int leftIdx = offset - 1;
    int remainingRows = 8;
    while (remainingRows > 0)
    {
        // memset(dst, dst[-1], 8);
        // dst += BPS;
        Memset(dst, yuv[leftIdx], 0, 8);
        dst = dst.Slice(WebpConstants.Bps);
        leftIdx += WebpConstants.Bps;
        remainingRows--;
    }
}
/// <summary>
/// 8x8 chroma DC prediction with no top samples: sums the 8 left samples, adds 4,
/// shifts right by 3 and passes the result to Put8x8uv.
/// </summary>
public static void DC8uvNoTop(Span<byte> dst, Span<byte> yuv, int offset)
{
    int sum = 4;
    int leftIdx = offset - 1;
    for (int row = 0; row < 8; row++, leftIdx += WebpConstants.Bps)
    {
        // dc0 += dst[-1 + i * BPS];
        sum += yuv[leftIdx];
    }

    Put8x8uv((byte)(sum >> 3), dst);
}
/// <summary>
/// 8x8 chroma DC prediction with no left samples: sums the 8 top samples, adds 4,
/// shifts right by 3 and passes the result to Put8x8uv.
/// </summary>
public static void DC8uvNoLeft(Span<byte> dst, Span<byte> yuv, int offset)
{
    int sum = 4;
    int topIdx = offset - WebpConstants.Bps;
    int topEnd = topIdx + 8;
    for (; topIdx < topEnd; topIdx++)
    {
        // dc0 += dst[i - BPS];
        sum += yuv[topIdx];
    }

    Put8x8uv((byte)(sum >> 3), dst);
}
/// <summary>
/// 8x8 chroma DC prediction with neither top nor left samples: passes the constant 0x80 to Put8x8uv.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void DC8uvNoTopLeft(Span<byte> dst)
{
    Put8x8uv(0x80, dst);
}
/// <summary>
/// 4x4 DC prediction: sums the 4 top and 4 left samples around <paramref name="offset"/>,
/// adds 4, shifts right by 3, and passes that value to Memset for each of the 4 rows of
/// <paramref name="dst"/> (row starts are <see cref="WebpConstants.Bps"/> apart, count 4).
/// </summary>
public static void DC4(Span<byte> dst, Span<byte> yuv, int offset)
{
    int topIdx = offset - WebpConstants.Bps;
    int leftIdx = offset - 1;

    int sum = 4;
    for (int k = 0; k < 4; k++, leftIdx += WebpConstants.Bps)
    {
        sum += yuv[topIdx + k] + yuv[leftIdx];
    }

    byte fill = (byte)(sum >> 3);
    for (int row = 0; row < 4; row++)
    {
        Memset(dst, fill, row * WebpConstants.Bps, 4);
    }
}
/// <summary>
/// Forwards to TrueMotion with 4 as the last argument.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void TM4(Span<byte> dst, Span<byte> yuv, int offset)
{
    TrueMotion(dst, yuv, offset, 4);
}
/// <summary>
/// 4x4 vertical prediction: stores four Avg3 results of neighbouring top samples in
/// <paramref name="vals"/> and copies <paramref name="vals"/> into each of the 4 rows of <paramref name="dst"/>.
/// </summary>
/// <param name="dst">Destination block; rows are <see cref="WebpConstants.Bps"/> bytes apart.</param>
/// <param name="yuv">Buffer holding the neighbouring samples.</param>
/// <param name="offset">Index of the block's first sample in <paramref name="yuv"/>.</param>
/// <param name="vals">Work buffer receiving the four values. The whole span is copied to every row.</param>
public static void VE4(Span<byte> dst, Span<byte> yuv, int offset, Span<byte> vals)
{
    // Index of the sample directly above the block's first sample.
    int top = offset - WebpConstants.Bps;
    for (int k = 0; k < 4; k++)
    {
        int centre = top + k;
        vals[k] = Avg3(yuv[centre - 1], yuv[centre], yuv[centre + 1]);
    }

    for (int row = 0; row < 4; row++)
    {
        vals.CopyTo(dst.Slice(row * WebpConstants.Bps));
    }
}
/// <summary>
/// 4x4 horizontal prediction: each row of <paramref name="dst"/> receives four copies of an
/// Avg3 result of three vertically adjacent left-hand samples.
/// </summary>
public static void HE4(Span<byte> dst, Span<byte> yuv, int offset)
{
    int left = offset - 1;
    byte aboveLeft = yuv[left - WebpConstants.Bps];
    byte left0 = yuv[left];
    byte left1 = yuv[left + WebpConstants.Bps];
    byte left2 = yuv[left + (2 * WebpConstants.Bps)];
    byte left3 = yuv[left + (3 * WebpConstants.Bps)];

    // Multiplying by 0x01010101 replicates the byte into all four bytes of the word.
    uint packed = 0x01010101U * Avg3(aboveLeft, left0, left1);
    BinaryPrimitives.WriteUInt32BigEndian(dst, packed);

    packed = 0x01010101U * Avg3(left0, left1, left2);
    BinaryPrimitives.WriteUInt32BigEndian(dst.Slice(WebpConstants.Bps), packed);

    packed = 0x01010101U * Avg3(left1, left2, left3);
    BinaryPrimitives.WriteUInt32BigEndian(dst.Slice(2 * WebpConstants.Bps), packed);

    packed = 0x01010101U * Avg3(left2, left3, left3);
    BinaryPrimitives.WriteUInt32BigEndian(dst.Slice(3 * WebpConstants.Bps), packed);
}
/// <summary>
/// 4x4 down-right prediction. Reads the four left samples, the top-left corner sample and the
/// four top samples around <paramref name="offset"/>, and writes Avg3 combinations of them through Dst.
/// </summary>
public static void RD4(Span<byte> dst, Span<byte> yuv, int offset)
{
    int left = offset - 1;
    int top = offset - WebpConstants.Bps;

    byte left0 = yuv[left];
    byte left1 = yuv[left + WebpConstants.Bps];
    byte left2 = yuv[left + (2 * WebpConstants.Bps)];
    byte left3 = yuv[left + (3 * WebpConstants.Bps)];
    byte corner = yuv[left - WebpConstants.Bps];
    byte top0 = yuv[top];
    byte top1 = yuv[top + 1];
    byte top2 = yuv[top + 2];
    byte top3 = yuv[top + 3];

    Dst(dst, 0, 3, Avg3(left1, left2, left3));

    byte avgL0L1L2 = Avg3(left0, left1, left2);
    Dst(dst, 1, 3, avgL0L1L2);
    Dst(dst, 0, 2, avgL0L1L2);

    byte avgCL0L1 = Avg3(corner, left0, left1);
    Dst(dst, 2, 3, avgCL0L1);
    Dst(dst, 1, 2, avgCL0L1);
    Dst(dst, 0, 1, avgCL0L1);

    byte avgT0CL0 = Avg3(top0, corner, left0);
    Dst(dst, 3, 3, avgT0CL0);
    Dst(dst, 2, 2, avgT0CL0);
    Dst(dst, 1, 1, avgT0CL0);
    Dst(dst, 0, 0, avgT0CL0);

    byte avgT1T0C = Avg3(top1, top0, corner);
    Dst(dst, 3, 2, avgT1T0C);
    Dst(dst, 2, 1, avgT1T0C);
    Dst(dst, 1, 0, avgT1T0C);

    byte avgT2T1T0 = Avg3(top2, top1, top0);
    Dst(dst, 3, 1, avgT2T1T0);
    Dst(dst, 2, 0, avgT2T1T0);

    Dst(dst, 3, 0, Avg3(top3, top2, top1));
}
/// <summary>
/// 4x4 vertical-right prediction. Reads three left samples, the top-left corner sample and the
/// four top samples around <paramref name="offset"/>, and writes Avg2/Avg3 combinations of them through Dst.
/// </summary>
public static void VR4(Span<byte> dst, Span<byte> yuv, int offset)
{
    int left = offset - 1;
    int top = offset - WebpConstants.Bps;

    byte left0 = yuv[left];
    byte left1 = yuv[left + WebpConstants.Bps];
    byte left2 = yuv[left + (2 * WebpConstants.Bps)];
    byte corner = yuv[left - WebpConstants.Bps];
    byte top0 = yuv[top];
    byte top1 = yuv[top + 1];
    byte top2 = yuv[top + 2];
    byte top3 = yuv[top + 3];

    // Two-sample averages.
    byte avgCT0 = Avg2(corner, top0);
    Dst(dst, 0, 0, avgCT0);
    Dst(dst, 1, 2, avgCT0);

    byte avgT0T1 = Avg2(top0, top1);
    Dst(dst, 1, 0, avgT0T1);
    Dst(dst, 2, 2, avgT0T1);

    byte avgT1T2 = Avg2(top1, top2);
    Dst(dst, 2, 0, avgT1T2);
    Dst(dst, 3, 2, avgT1T2);

    Dst(dst, 3, 0, Avg2(top2, top3));

    // Three-sample averages.
    Dst(dst, 0, 3, Avg3(left2, left1, left0));
    Dst(dst, 0, 2, Avg3(left1, left0, corner));

    byte avgL0CT0 = Avg3(left0, corner, top0);
    Dst(dst, 0, 1, avgL0CT0);
    Dst(dst, 1, 3, avgL0CT0);

    byte avgCT0T1 = Avg3(corner, top0, top1);
    Dst(dst, 1, 1, avgCT0T1);
    Dst(dst, 2, 3, avgCT0T1);

    byte avgT0T1T2 = Avg3(top0, top1, top2);
    Dst(dst, 2, 1, avgT0T1T2);
    Dst(dst, 3, 3, avgT0T1T2);

    Dst(dst, 3, 1, Avg3(top1, top2, top3));
}
/// <summary>
/// 4x4 down-left prediction. Reads the eight samples of the row above <paramref name="offset"/>
/// and writes Avg3 combinations of them through Dst.
/// </summary>
public static void LD4(Span<byte> dst, Span<byte> yuv, int offset)
{
    int top = offset - WebpConstants.Bps;

    byte top0 = yuv[top];
    byte top1 = yuv[top + 1];
    byte top2 = yuv[top + 2];
    byte top3 = yuv[top + 3];
    byte top4 = yuv[top + 4];
    byte top5 = yuv[top + 5];
    byte top6 = yuv[top + 6];
    byte top7 = yuv[top + 7];

    Dst(dst, 0, 0, Avg3(top0, top1, top2));

    byte avg123 = Avg3(top1, top2, top3);
    Dst(dst, 1, 0, avg123);
    Dst(dst, 0, 1, avg123);

    byte avg234 = Avg3(top2, top3, top4);
    Dst(dst, 2, 0, avg234);
    Dst(dst, 1, 1, avg234);
    Dst(dst, 0, 2, avg234);

    byte avg345 = Avg3(top3, top4, top5);
    Dst(dst, 3, 0, avg345);
    Dst(dst, 2, 1, avg345);
    Dst(dst, 1, 2, avg345);
    Dst(dst, 0, 3, avg345);

    byte avg456 = Avg3(top4, top5, top6);
    Dst(dst, 3, 1, avg456);
    Dst(dst, 2, 2, avg456);
    Dst(dst, 1, 3, avg456);

    byte avg567 = Avg3(top5, top6, top7);
    Dst(dst, 3, 2, avg567);
    Dst(dst, 2, 3, avg567);

    // The last sample is used twice because there is no ninth top sample read here.
    Dst(dst, 3, 3, Avg3(top6, top7, top7));
}
/// <summary>
/// 4x4 vertical-left prediction. Reads the eight samples of the row above <paramref name="offset"/>
/// and writes Avg2/Avg3 combinations of them through Dst.
/// </summary>
public static void VL4(Span<byte> dst, Span<byte> yuv, int offset)
{
    int top = offset - WebpConstants.Bps;

    byte top0 = yuv[top];
    byte top1 = yuv[top + 1];
    byte top2 = yuv[top + 2];
    byte top3 = yuv[top + 3];
    byte top4 = yuv[top + 4];
    byte top5 = yuv[top + 5];
    byte top6 = yuv[top + 6];
    byte top7 = yuv[top + 7];

    // Two-sample averages.
    Dst(dst, 0, 0, Avg2(top0, top1));

    byte avg12 = Avg2(top1, top2);
    Dst(dst, 1, 0, avg12);
    Dst(dst, 0, 2, avg12);

    byte avg23 = Avg2(top2, top3);
    Dst(dst, 2, 0, avg23);
    Dst(dst, 1, 2, avg23);

    byte avg34 = Avg2(top3, top4);
    Dst(dst, 3, 0, avg34);
    Dst(dst, 2, 2, avg34);

    // Three-sample averages.
    Dst(dst, 0, 1, Avg3(top0, top1, top2));

    byte avg123 = Avg3(top1, top2, top3);
    Dst(dst, 1, 1, avg123);
    Dst(dst, 0, 3, avg123);

    byte avg234 = Avg3(top2, top3, top4);
    Dst(dst, 2, 1, avg234);
    Dst(dst, 1, 3, avg234);

    byte avg345 = Avg3(top3, top4, top5);
    Dst(dst, 3, 1, avg345);
    Dst(dst, 2, 3, avg345);

    Dst(dst, 3, 2, Avg3(top4, top5, top6));
    Dst(dst, 3, 3, Avg3(top5, top6, top7));
}
/// <summary>
/// 4x4 horizontal-down prediction. Reads the four left samples, the top-left corner sample and
/// three top samples around <paramref name="offset"/>, and writes Avg2/Avg3 combinations of them through Dst.
/// </summary>
public static void HD4(Span<byte> dst, Span<byte> yuv, int offset)
{
    int left = offset - 1;
    int top = offset - WebpConstants.Bps;

    byte left0 = yuv[left];
    byte left1 = yuv[left + WebpConstants.Bps];
    byte left2 = yuv[left + (2 * WebpConstants.Bps)];
    byte left3 = yuv[left + (3 * WebpConstants.Bps)];
    byte corner = yuv[left - WebpConstants.Bps];
    byte top0 = yuv[top];
    byte top1 = yuv[top + 1];
    byte top2 = yuv[top + 2];

    // Two-sample averages.
    byte avgL0C = Avg2(left0, corner);
    Dst(dst, 0, 0, avgL0C);
    Dst(dst, 2, 1, avgL0C);

    byte avgL1L0 = Avg2(left1, left0);
    Dst(dst, 0, 1, avgL1L0);
    Dst(dst, 2, 2, avgL1L0);

    byte avgL2L1 = Avg2(left2, left1);
    Dst(dst, 0, 2, avgL2L1);
    Dst(dst, 2, 3, avgL2L1);

    Dst(dst, 0, 3, Avg2(left3, left2));

    // Three-sample averages.
    Dst(dst, 3, 0, Avg3(top0, top1, top2));
    Dst(dst, 2, 0, Avg3(corner, top0, top1));

    byte avgL0CT0 = Avg3(left0, corner, top0);
    Dst(dst, 1, 0, avgL0CT0);
    Dst(dst, 3, 1, avgL0CT0);

    byte avgL1L0C = Avg3(left1, left0, corner);
    Dst(dst, 1, 1, avgL1L0C);
    Dst(dst, 3, 2, avgL1L0C);

    byte avgL2L1L0 = Avg3(left2, left1, left0);
    Dst(dst, 1, 2, avgL2L1L0);
    Dst(dst, 3, 3, avgL2L1L0);

    Dst(dst, 1, 3, Avg3(left3, left2, left1));
}
/// <summary>
/// 4x4 horizontal-up prediction. Reads the four samples to the left of <paramref name="offset"/>
/// and writes Avg2/Avg3 combinations of them, plus the last left sample itself, through Dst.
/// </summary>
public static void HU4(Span<byte> dst, Span<byte> yuv, int offset)
{
    int left = offset - 1;

    byte left0 = yuv[left];
    byte left1 = yuv[left + WebpConstants.Bps];
    byte left2 = yuv[left + (2 * WebpConstants.Bps)];
    byte left3 = yuv[left + (3 * WebpConstants.Bps)];

    // Two-sample averages.
    Dst(dst, 0, 0, Avg2(left0, left1));

    byte avgL1L2 = Avg2(left1, left2);
    Dst(dst, 2, 0, avgL1L2);
    Dst(dst, 0, 1, avgL1L2);

    byte avgL2L3 = Avg2(left2, left3);
    Dst(dst, 2, 1, avgL2L3);
    Dst(dst, 0, 2, avgL2L3);

    // Three-sample averages.
    Dst(dst, 1, 0, Avg3(left0, left1, left2));

    byte avgL1L2L3 = Avg3(left1, left2, left3);
    Dst(dst, 3, 0, avgL1L2L3);
    Dst(dst, 1, 1, avgL1L2L3);

    byte avgL2L3L3 = Avg3(left2, left3, left3);
    Dst(dst, 3, 1, avgL2L3L3);
    Dst(dst, 1, 2, avgL2L3L3);

    // The remaining positions take the last left sample unchanged.
    Dst(dst, 3, 2, left3);
    Dst(dst, 2, 2, left3);
    Dst(dst, 0, 3, left3);
    Dst(dst, 1, 3, left3);
    Dst(dst, 2, 3, left3);
    Dst(dst, 3, 3, left3);
}
/// <summary>
/// Paragraph 14.3: Implementation of the Walsh-Hadamard transform inversion.
/// </summary>
/// <param name="input">The 16 input coefficients.</param>
/// <param name="output">Receives the 16 results, one every 16 elements (indices 0, 16, ..., 240).</param>
/// <param name="scratch">Work buffer; its first 16 entries are overwritten.</param>
public static void TransformWht(Span<short> input, Span<short> output, Span<int> scratch)
{
    Span<int> work = scratch.Slice(0, 16);
    work.Clear();

    // First pass: one butterfly per column of the 4x4 input.
    for (int col = 0; col < 4; col++)
    {
        int in0 = input[col];
        int in4 = input[col + 4];
        int in8 = input[col + 8];
        int in12 = input[col + 12];

        int outerSum = in0 + in12;
        int innerSum = in4 + in8;
        int innerDiff = in4 - in8;
        int outerDiff = in0 - in12;

        work[col] = outerSum + innerSum;
        work[col + 8] = outerSum - innerSum;
        work[col + 4] = outerDiff + innerDiff;
        work[col + 12] = outerDiff - innerDiff;
    }

    // Second pass: one butterfly per row of the intermediate values, with rounding (+3, >> 3).
    for (int row = 0; row < 4; row++)
    {
        int rowStart = row * 4;
        int dc = work[rowStart] + 3;
        int outerSum = dc + work[rowStart + 3];
        int innerSum = work[rowStart + 1] + work[rowStart + 2];
        int innerDiff = work[rowStart + 1] - work[rowStart + 2];
        int outerDiff = dc - work[rowStart + 3];

        int target = row * 64;
        output[target] = (short)((outerSum + innerSum) >> 3);
        output[target + 16] = (short)((outerDiff + innerDiff) >> 3);
        output[target + 32] = (short)((outerSum - innerSum) >> 3);
        output[target + 48] = (short)((outerDiff - innerDiff) >> 3);
    }
}
/// <summary>
/// Hadamard transform of a 4x4 block.
/// Returns the weighted sum of the absolute value of transformed coefficients.
/// w[] contains a row-major 4 by 4 symmetric matrix.
/// </summary>
/// <param name="input">The block; rows are <see cref="WebpConstants.Bps"/> bytes apart.</param>
/// <param name="w">The 16 weights.</param>
/// <param name="scratch">Work buffer; its first 16 entries are overwritten.</param>
/// <returns>The weighted sum.</returns>
public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch)
{
    Span<int> work = scratch.Slice(0, 16);
    work.Clear();

    // Horizontal pass: one butterfly per input row.
    for (int row = 0, src = 0; row < 4; row++, src += WebpConstants.Bps)
    {
        int p0 = input[src];
        int p1 = input[src + 1];
        int p2 = input[src + 2];
        int p3 = input[src + 3];

        int evenSum = p0 + p2;
        int oddSum = p1 + p3;
        int oddDiff = p1 - p3;
        int evenDiff = p0 - p2;

        int target = row * 4;
        work[target] = evenSum + oddSum;
        work[target + 1] = evenDiff + oddDiff;
        work[target + 2] = evenDiff - oddDiff;
        work[target + 3] = evenSum - oddSum;
    }

    // Vertical pass: one butterfly per column, weighted and accumulated right away.
    int weighted = 0;
    for (int col = 0; col < 4; col++)
    {
        int evenSum = work[col] + work[col + 8];
        int oddSum = work[col + 4] + work[col + 12];
        int oddDiff = work[col + 4] - work[col + 12];
        int evenDiff = work[col] - work[col + 8];

        weighted += w[col] * Math.Abs(evenSum + oddSum);
        weighted += w[col + 4] * Math.Abs(evenDiff + oddDiff);
        weighted += w[col + 8] * Math.Abs(evenDiff - oddDiff);
        weighted += w[col + 12] * Math.Abs(evenSum - oddSum);
    }

    return weighted;
}
#if SUPPORTS_RUNTIME_INTRINSICS
/// <summary>
/// Hadamard transform of two 4x4 blocks at once (SIMD variant).
/// Returns the difference of the two weighted sums of the absolute value of transformed coefficients:
/// the sum for <paramref name="inputA"/> minus the sum for <paramref name="inputB"/>.
/// w[] contains a row-major 4 by 4 symmetric matrix.
/// </summary>
/// <param name="inputA">First block. 16 bytes are loaded per row although only the first 4 of each row are used; the row 0 load is not bounds checked.</param>
/// <param name="inputB">Second block, same layout as <paramref name="inputA"/>.</param>
/// <param name="w">The 16 weights. The first 8 are loaded without a bounds check.</param>
/// <returns>The signed difference of the weighted sums (A minus B).</returns>
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
{
// Load and combine inputs.
// Rows 1..3 go through Slice(…, 16), which validates that 16 bytes are available; row 0 does not.
Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
Vector128<byte> ina2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 2, 16)));
Vector128<long> ina3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
Vector128<byte> inb0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB));
Vector128<byte> inb1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps, 16)));
Vector128<byte> inb2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 2, 16)));
Vector128<long> inb3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
// Combine inA and inB (we'll do two transforms in parallel).
// After the 32-bit interleave the low 8 bytes hold 4 pixels of A followed by 4 pixels of B.
Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
// Widen the low 8 bytes of each row to eight 16-bit lanes.
Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent transpose.
// Calculate a and b (two 4x4 at once).
Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
Vector128<short> b0 = Sse2.Add(a0, a1);
Vector128<short> b1 = Sse2.Add(a3, a2);
Vector128<short> b2 = Sse2.Subtract(a3, a2);
Vector128<short> b3 = Sse2.Subtract(a0, a1);
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Transpose the two 4x4.
Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
// Horizontal pass and difference of weighted sums.
// Load all 16 weights: w[0..7] (no bounds check) and w[8..15] (validated by Slice).
Vector128<ushort> w0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w));
Vector128<ushort> w8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w.Slice(8, 8)));
// Calculate a and b (two 4x4 at once).
a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
b0 = Sse2.Add(a0, a1);
b1 = Sse2.Add(a3, a2);
b2 = Sse2.Subtract(a3, a2);
b3 = Sse2.Subtract(a0, a1);
// Separate the transforms of inA and inB.
// The low 64 bits of every row belong to A, the high 64 bits to B.
Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
// Absolute value of every 16-bit coefficient.
Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
// weighted sums.
// MultiplyAddAdjacent multiplies lane-wise and adds neighbouring pairs into 32-bit lanes.
Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
// difference of weighted sums.
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
return Numerics.ReduceSum(result);
}
/// <summary>
/// Transposes two 4x4 matrices of 16-bit values that are stored side by side in four registers:
/// the low four lanes of every input row belong to matrix "a", the high four lanes to matrix "b".
/// </summary>
/// <param name="b0">Input row 0 (a00 a01 a02 a03 b00 b01 b02 b03).</param>
/// <param name="b1">Input row 1.</param>
/// <param name="b2">Input row 2.</param>
/// <param name="b3">Input row 3.</param>
/// <param name="output0">Transposed row 0 (a00 a10 a20 a30 b00 b10 b20 b30).</param>
/// <param name="output1">Transposed row 1.</param>
/// <param name="output2">Transposed row 2.</param>
/// <param name="output3">Transposed row 3.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Vp8Transpose_2_4x4_16b(Vector128<short> b0, Vector128<short> b1, Vector128<short> b2, Vector128<short> b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3)
{
    // Step 1: interleave 16-bit lanes of row pairs.
    // rows01Lo: a00 a10 a01 a11 a02 a12 a03 a13
    // rows23Lo: a20 a30 a21 a31 a22 a32 a23 a33
    // rows01Hi: b00 b10 b01 b11 b02 b12 b03 b13
    // rows23Hi: b20 b30 b21 b31 b22 b32 b23 b33
    Vector128<int> rows01Lo = Sse2.UnpackLow(b0, b1).AsInt32();
    Vector128<int> rows23Lo = Sse2.UnpackLow(b2, b3).AsInt32();
    Vector128<int> rows01Hi = Sse2.UnpackHigh(b0, b1).AsInt32();
    Vector128<int> rows23Hi = Sse2.UnpackHigh(b2, b3).AsInt32();

    // Step 2: interleave 32-bit lanes.
    // aCols01: a00 a10 a20 a30 a01 a11 a21 a31
    // bCols01: b00 b10 b20 b30 b01 b11 b21 b31
    // aCols23: a02 a12 a22 a32 a03 a13 a23 a33
    // bCols23: b02 b12 b22 b32 b03 b13 b23 b33
    Vector128<long> aCols01 = Sse2.UnpackLow(rows01Lo, rows23Lo).AsInt64();
    Vector128<long> bCols01 = Sse2.UnpackLow(rows01Hi, rows23Hi).AsInt64();
    Vector128<long> aCols23 = Sse2.UnpackHigh(rows01Lo, rows23Lo).AsInt64();
    Vector128<long> bCols23 = Sse2.UnpackHigh(rows01Hi, rows23Hi).AsInt64();

    // Step 3: interleave 64-bit halves so each output again holds "a" low and "b" high.
    // output0: a00 a10 a20 a30 b00 b10 b20 b30
    // output1: a01 a11 a21 a31 b01 b11 b21 b31
    // output2: a02 a12 a22 a32 b02 b12 b22 b32
    // output3: a03 a13 a23 a33 b03 b13 b23 b33
    output0 = Sse2.UnpackLow(aCols01, bCols01);
    output1 = Sse2.UnpackHigh(aCols01, bCols01);
    output2 = Sse2.UnpackLow(aCols23, bCols23);
    output3 = Sse2.UnpackHigh(aCols23, bCols23);
}
#endif
/// <summary>
/// Runs <see cref="TransformOne"/> twice: once on the spans as given, and once on
/// <paramref name="src"/> advanced by 16 elements and <paramref name="dst"/> advanced by 4 bytes.
/// </summary>
public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scratch)
{
    TransformOne(src, dst, scratch);

    Span<short> secondSrc = src.Slice(16);
    Span<byte> secondDst = dst.Slice(4);
    TransformOne(secondSrc, secondDst, scratch);
}
/// <summary>
/// Two-pass transform of one 4x4 block of coefficients. The results of the second pass are
/// handed to Store together with the matching row of <paramref name="dst"/>.
/// </summary>
/// <param name="src">The 16 coefficients.</param>
/// <param name="dst">Destination block; rows are <see cref="WebpConstants.Bps"/> bytes apart.</param>
/// <param name="scratch">Work buffer; its first 16 entries are overwritten.</param>
public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch)
{
    Span<int> work = scratch.Slice(0, 16);

    // Vertical pass.
    for (int col = 0; col < 4; col++)
    {
        short in0 = src[col];
        short in4 = src[col + 4];
        short in8 = src[col + 8];
        short in12 = src[col + 12];

        int sum = in0 + in8;
        int diff = in0 - in8;
        int rotC = Mul2(in4) - Mul1(in12);
        int rotD = Mul1(in4) + Mul2(in12);

        int target = col * 4;
        work[target] = sum + rotD;
        work[target + 1] = diff + rotC;
        work[target + 2] = diff - rotC;
        work[target + 3] = sum - rotD;
    }

    // Each pass is expanding the dynamic range by ~3.85 (upper bound).
    // The exact value is (2. + (20091 + 35468) / 65536).
    // After the second pass, maximum interval is [-3794, 3794], assuming
    // an input in [-2048, 2047] interval. We then need to add a dst value in the [0, 255] range.
    // In the worst case scenario, the input to clip_8b() can be as large as [-60713, 60968].

    // Horizontal pass.
    for (int row = 0; row < 4; row++)
    {
        int dc = work[row] + 4;
        int sum = dc + work[row + 8];
        int diff = dc - work[row + 8];
        int rotC = Mul2(work[row + 4]) - Mul1(work[row + 12]);
        int rotD = Mul1(work[row + 4]) + Mul2(work[row + 12]);

        Span<byte> dstRow = dst.Slice(row * WebpConstants.Bps);
        Store(dstRow, 0, 0, sum + rotD);
        Store(dstRow, 1, 0, diff + rotC);
        Store(dstRow, 2, 0, diff - rotC);
        Store(dstRow, 3, 0, sum - rotD);
    }
}
/// <summary>
/// Transform for a block where only the DC coefficient is used: passes src[0] + 4 to Store
/// for each of the 16 positions of a 4x4 block.
/// </summary>
public static void TransformDc(Span<short> src, Span<byte> dst)
{
    int biasedDc = src[0] + 4;
    for (int y = 0; y < 4; y++)
    {
        for (int x = 0; x < 4; x++)
        {
            Store(dst, x, y, biasedDc);
        }
    }
}
/// <summary>
/// Simplified transform when only src[0], src[1] and src[4] are non-zero.
/// </summary>
public static void TransformAc3(Span<short> src, Span<byte> dst)
{
    int biasedDc = src[0] + 4;

    // Row terms come from src[4], column terms from src[1].
    int rowC = Mul2(src[4]);
    int rowD = Mul1(src[4]);
    int colC = Mul2(src[1]);
    int colD = Mul1(src[1]);

    Store2(dst, 0, biasedDc + rowD, colD, colC);
    Store2(dst, 1, biasedDc + rowC, colD, colC);
    Store2(dst, 2, biasedDc - rowC, colD, colC);
    Store2(dst, 3, biasedDc - rowD, colD, colC);
}
/// <summary>
/// Runs <see cref="TransformTwo"/> on the first pair of 16-coefficient blocks and then on the
/// second pair, the latter targeting <paramref name="dst"/> advanced by four rows.
/// </summary>
public static void TransformUv(Span<short> src, Span<byte> dst, Span<int> scratch)
{
    const int coeffsPerBlock = 16;

    Span<short> upperPair = src.Slice(0 * coeffsPerBlock);
    TransformTwo(upperPair, dst, scratch);

    Span<short> lowerPair = src.Slice(2 * coeffsPerBlock);
    Span<byte> lowerDst = dst.Slice(4 * WebpConstants.Bps);
    TransformTwo(lowerPair, lowerDst, scratch);
}
/// <summary>
/// Runs <see cref="TransformDc"/> on each of four consecutive 16-coefficient blocks whose first
/// coefficient is non-zero. The blocks map to a 2x2 arrangement of 4x4 areas in <paramref name="dst"/>.
/// </summary>
public static void TransformDcuv(Span<short> src, Span<byte> dst)
{
    for (int block = 0; block < 4; block++)
    {
        int srcStart = block * 16;
        if (src[srcStart] == 0)
        {
            continue;
        }

        // Blocks 0/1 sit in the upper row of 4x4 areas, blocks 2/3 four rows further down;
        // odd blocks are shifted 4 bytes to the right.
        int dstStart = ((block >> 1) * 4 * WebpConstants.Bps) + ((block & 1) * 4);
        TransformDc(src.Slice(srcStart), dst.Slice(dstStart));
    }
}
/// <summary>
/// Simple in-loop filtering (Paragraph 15.2): visits 16 consecutive positions starting at
/// <paramref name="offset"/> and applies DoFilter2 with step <paramref name="stride"/>
/// wherever NeedsFilter reports true.
/// </summary>
public static void SimpleVFilter16(Span<byte> p, int offset, int stride, int thresh)
{
    int limit = (2 * thresh) + 1;
    for (int k = 0; k < 16; k++)
    {
        int pos = offset + k;
        if (NeedsFilter(p, pos, stride, limit))
        {
            DoFilter2(p, pos, stride);
        }
    }
}
/// <summary>
/// Simple in-loop filtering: visits positions from <paramref name="offset"/> up to (but not
/// including) offset + 16 * stride in steps of <paramref name="stride"/> and applies DoFilter2
/// with step 1 wherever NeedsFilter reports true.
/// </summary>
public static void SimpleHFilter16(Span<byte> p, int offset, int stride, int thresh)
{
    int limit = (2 * thresh) + 1;
    int stop = offset + (16 * stride);
    int pos = offset;
    while (pos < stop)
    {
        if (NeedsFilter(p, pos, 1, limit))
        {
            DoFilter2(p, pos, 1);
        }

        pos += stride;
    }
}
/// <summary>
/// Calls <see cref="SimpleVFilter16"/> three times, at offsets 4, 8 and 12 strides past <paramref name="offset"/>.
/// </summary>
public static void SimpleVFilter16i(Span<byte> p, int offset, int stride, int thresh)
{
    int step = 4 * stride;
    for (int edge = 1; edge <= 3; edge++)
    {
        SimpleVFilter16(p, offset + (edge * step), stride, thresh);
    }
}
/// <summary>
/// Calls <see cref="SimpleHFilter16"/> three times, at offsets 4, 8 and 12 past <paramref name="offset"/>.
/// </summary>
public static void SimpleHFilter16i(Span<byte> p, int offset, int stride, int thresh)
{
    for (int edge = 1; edge <= 3; edge++)
    {
        SimpleHFilter16(p, offset + (edge * 4), stride, thresh);
    }
}
/// <summary>
/// Forwards to FilterLoop26 with <paramref name="stride"/> as the first step argument, 1 as the
/// second and a size of 16.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void VFilter16(Span<byte> p, int offset, int stride, int thresh, int ithresh, int hevThresh)
{
    FilterLoop26(p, offset, stride, 1, 16, thresh, ithresh, hevThresh);
}
/// <summary>
/// Forwards to FilterLoop26 with 1 as the first step argument, <paramref name="stride"/> as the
/// second and a size of 16.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void HFilter16(Span<byte> p, int offset, int stride, int thresh, int ithresh, int hevThresh)
{
    FilterLoop26(p, offset, 1, stride, 16, thresh, ithresh, hevThresh);
}
/// <summary>
/// Calls FilterLoop24 three times, at offsets 4, 8 and 12 strides past <paramref name="offset"/>,
/// with <paramref name="stride"/> as the first step argument and 1 as the second.
/// </summary>
public static void VFilter16i(Span<byte> p, int offset, int stride, int thresh, int ithresh, int hevThresh)
{
    int step = 4 * stride;
    for (int edge = 1; edge <= 3; edge++)
    {
        FilterLoop24(p, offset + (edge * step), stride, 1, 16, thresh, ithresh, hevThresh);
    }
}
/// <summary>
/// Calls FilterLoop24 three times, at offsets 4, 8 and 12 past <paramref name="offset"/>,
/// with 1 as the first step argument and <paramref name="stride"/> as the second.
/// </summary>
public static void HFilter16i(Span<byte> p, int offset, int stride, int thresh, int ithresh, int hevThresh)
{
    for (int edge = 1; edge <= 3; edge++)
    {
        FilterLoop24(p, offset + (edge * 4), 1, stride, 16, thresh, ithresh, hevThresh);
    }
}
/// <summary>
/// 8-pixels wide variant, for chroma filtering: applies FilterLoop26 with identical arguments
/// to <paramref name="u"/> and then to <paramref name="v"/>, using <paramref name="stride"/> as
/// the first step argument and 1 as the second.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void VFilter8(Span<byte> u, Span<byte> v, int offset, int stride, int thresh, int ithresh, int hevThresh)
{
    const int size = 8;
    FilterLoop26(u, offset, stride, 1, size, thresh, ithresh, hevThresh);
    FilterLoop26(v, offset, stride, 1, size, thresh, ithresh, hevThresh);
}
/// <summary>
/// Applies FilterLoop26 with identical arguments to <paramref name="u"/> and then to
/// <paramref name="v"/>, using 1 as the first step argument, <paramref name="stride"/> as the
/// second and a size of 8.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void HFilter8(Span<byte> u, Span<byte> v, int offset, int stride, int thresh, int ithresh, int hevThresh)
{
    const int size = 8;
    FilterLoop26(u, offset, 1, stride, size, thresh, ithresh, hevThresh);
    FilterLoop26(v, offset, 1, stride, size, thresh, ithresh, hevThresh);
}
/// <summary>
/// Applies FilterLoop24 to <paramref name="u"/> and then to <paramref name="v"/>, starting four
/// strides past <paramref name="offset"/>, with <paramref name="stride"/> as the first step
/// argument, 1 as the second and a size of 8.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public static void VFilter8i(Span<byte> u, Span<byte> v, int offset, int stride, int thresh, int ithresh, int hevThresh)
{
    const int size = 8;
    int innerEdge = offset + (4 * stride);
    FilterLoop24(u, innerEdge, stride, 1, size, thresh, ithresh, hevThresh);
    FilterLoop24(v, innerEdge, stride, 1, size, thresh, ithresh, hevThresh);
}
/// <summary>
/// Runs <c>FilterLoop24</c> over 8 positions that are <paramref name="stride"/> bytes apart, starting
/// four bytes past <paramref name="offset"/>, first on <paramref name="u"/> and then on <paramref name="v"/>.
/// </summary>
/// <param name="u">First buffer, filtered in place.</param>
/// <param name="v">Second buffer, filtered in place.</param>
/// <param name="offset">Base position; filtering starts at offset + 4.</param>
/// <param name="stride">Distance between two successive filtered positions.</param>
/// <param name="thresh">Edge threshold forwarded to the filter loop.</param>
/// <param name="ithresh">Interior threshold forwarded to the filter loop.</param>
/// <param name="hevThresh">High edge variance threshold forwarded to the filter loop.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void HFilter8i(Span<byte> u, Span<byte> v, int offset, int stride, int thresh, int ithresh, int hevThresh)
{
    const int Positions = 8;
    int start = 4 + offset;

    FilterLoop24(u, start, 1, stride, Positions, thresh, ithresh, hevThresh);
    FilterLoop24(v, start, 1, stride, Positions, thresh, ithresh, hevThresh);
}
/// <summary>
/// Sums the pixels of four horizontally adjacent 4x4 blocks (a 16x4 area whose rows are
/// <c>WebpConstants.Bps</c> bytes apart) and writes one sum per block to <paramref name="dc"/>.
/// </summary>
/// <remarks>
/// Despite the name, the stored values are plain sums of 16 bytes; no division is performed here.
/// </remarks>
/// <param name="input">Source pixels; (3 * Bps) + 16 bytes are required.</param>
/// <param name="dc">Receives the four block sums in elements 0 to 3.</param>
public static void Mean16x4(Span<byte> input, Span<uint> dc)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Ssse3.IsSupported)
    {
        // Validate both spans once up front: the vector loads and the vector store below go through
        // raw references, so a too-short span would otherwise be read or written out of bounds
        // instead of throwing.
        ref byte inputRef = ref MemoryMarshal.GetReference(input.Slice(0, (WebpConstants.Bps * 3) + 16));
        ref uint outputRef = ref MemoryMarshal.GetReference(dc.Slice(0, 4));

        // One 16 byte row per vector.
        Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref inputRef);
        Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref inputRef, WebpConstants.Bps));
        Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref inputRef, WebpConstants.Bps * 2));
        Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref inputRef, WebpConstants.Bps * 3));

        // Split every 16 bit lane into its two bytes so that they can be added without overflow.
        Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
        Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
        Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
        Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
        Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
        Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
        Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
        Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);

        // Each 16 bit half holds at most 8 * 255 after these additions, so nothing carries
        // between the halves of a 32 bit lane.
        Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
        Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
        Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
        Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());
        Vector128<int> e0 = Sse2.Add(d0, d1);
        Vector128<int> e1 = Sse2.Add(d2, d3);
        Vector128<int> f0 = Sse2.Add(e0, e1);

        // Add neighbouring 16 bit lanes to get the four block sums, then widen them to 32 bit.
        Vector128<short> hadd = Ssse3.HorizontalAdd(f0.AsInt16(), f0.AsInt16());
        Vector128<uint> wide = Sse2.UnpackLow(hadd, Vector128<short>.Zero).AsUInt32();

        Unsafe.As<uint, Vector128<uint>>(ref outputRef) = wide;
    }
    else
#endif
    {
        for (int k = 0; k < 4; k++)
        {
            uint avg = 0;
            for (int y = 0; y < 4; y++)
            {
                for (int x = 0; x < 4; x++)
                {
                    avg += input[x + (y * WebpConstants.Bps)];
                }
            }

            dc[k] = avg;
            input = input.Slice(4); // go to next 4x4 block.
        }
    }
}
/// <summary>
/// Returns the average of two bytes, rounded half up.
/// </summary>
/// <param name="a">The first value.</param>
/// <param name="b">The second value.</param>
/// <returns>(a + b + 1) / 2 as a byte.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static byte Avg2(byte a, byte b)
{
    int roundedSum = a + b + 1;
    return (byte)(roundedSum >> 1);
}
/// <summary>
/// Returns the weighted average of three bytes with weights 1, 2 and 1, rounded half up.
/// </summary>
/// <param name="a">The first value (weight 1).</param>
/// <param name="b">The middle value (weight 2).</param>
/// <param name="c">The last value (weight 1).</param>
/// <returns>(a + 2 * b + c + 2) / 4 as a byte.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static byte Avg3(byte a, byte b, byte c)
{
    int weightedSum = a + (2 * b) + c;
    return (byte)((weightedSum + 2) >> 2);
}
/// <summary>
/// Writes <paramref name="v"/> at column <paramref name="x"/> of row <paramref name="y"/>,
/// where rows are <c>WebpConstants.Bps</c> bytes apart.
/// </summary>
/// <param name="dst">The destination buffer.</param>
/// <param name="x">The column.</param>
/// <param name="y">The row.</param>
/// <param name="v">The value to store.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Dst(Span<byte> dst, int x, int y, byte v)
{
    int index = x + (y * WebpConstants.Bps);
    dst[index] = v;
}
/// <summary>
/// Clamps an integer to the byte range.
/// </summary>
/// <param name="v">The value to clamp.</param>
/// <returns>0 for negative input, 255 for input above 255, otherwise the input itself.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static byte Clip8B(int v)
{
    if (v < 0)
    {
        return 0;
    }

    if (v > 255)
    {
        return 255;
    }

    return (byte)v;
}
/// <summary>
/// Cost of coding one event with probability 'proba'.
/// </summary>
/// <param name="bit">The coded bit; zero selects the table entry at <paramref name="proba"/>, any other value the entry at 255 - proba.</param>
/// <param name="proba">The probability used as index into <c>WebpLookupTables.Vp8EntropyCost</c>.</param>
/// <returns>The looked-up cost.</returns>
public static int Vp8BitCost(int bit, byte proba)
{
    int index = bit == 0 ? proba : 255 - proba;
    return WebpLookupTables.Vp8EntropyCost[index];
}
/// <summary>
/// Fills a 16x16 area with a single value; rows are <c>WebpConstants.Bps</c> bytes apart.
/// </summary>
/// <param name="v">The fill value; only its low 8 bits are used.</param>
/// <param name="dst">The destination buffer, starting at the top-left corner of the area.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Put16(int v, Span<byte> dst)
{
    byte fill = (byte)v;
    int rowStart = 0;
    for (int row = 0; row < 16; row++)
    {
        Memset(dst.Slice(rowStart), fill, 0, 16);
        rowStart += WebpConstants.Bps;
    }
}
/// <summary>
/// Writes a true motion prediction of <paramref name="size"/> x <paramref name="size"/> pixels to
/// <paramref name="dst"/>: every pixel is left + top - topLeft, clamped to [0, 255].
/// </summary>
/// <remarks>
/// For information about how true motion works, see rfc6386, page 52. ff and section 20.14.
/// </remarks>
/// <param name="dst">Destination, starting at the top-left pixel of the block; rows are <c>WebpConstants.Bps</c> bytes apart.</param>
/// <param name="yuv">Buffer the top row, the left column and the top-left sample are read from.</param>
/// <param name="offset">Position of the block's top-left pixel inside <paramref name="yuv"/>.</param>
/// <param name="size">Width and height of the block.</param>
private static void TrueMotion(Span<byte> dst, Span<byte> yuv, int offset, int size)
{
    int topOffset = offset - WebpConstants.Bps;
    Span<byte> top = yuv.Slice(topOffset);
    byte p = yuv[topOffset - 1];

    int leftOffset = offset - 1;
    int rowStart = 0;
    for (int y = 0; y < size; y++)
    {
        // Read the left neighbour at the start of its own row and address dst by row offset,
        // so nothing past the last predicted row is ever read or sliced.
        int delta = yuv[leftOffset] - p;
        for (int x = 0; x < size; x++)
        {
            dst[rowStart + x] = (byte)Clamp255(top[x] + delta);
        }

        leftOffset += WebpConstants.Bps;
        rowStart += WebpConstants.Bps;
    }
}
// Complex In-loop filtering (Paragraph 15.3)

/// <summary>
/// Visits <paramref name="size"/> positions and, wherever <c>NeedsFilter2</c> accepts the position,
/// applies <c>DoFilter2</c> if <c>Hev</c> reports high edge variance and <c>DoFilter4</c> otherwise.
/// </summary>
/// <param name="p">The pixel buffer that is filtered in place.</param>
/// <param name="offset">The first position.</param>
/// <param name="hStride">Distance between the samples examined at one position.</param>
/// <param name="vStride">Distance between two successive positions.</param>
/// <param name="size">Number of positions to visit.</param>
/// <param name="thresh">Edge threshold; (2 * thresh) + 1 is the limit handed to <c>NeedsFilter2</c>.</param>
/// <param name="ithresh">Interior threshold handed to <c>NeedsFilter2</c>.</param>
/// <param name="hevThresh">Threshold handed to <c>Hev</c>.</param>
private static void FilterLoop24(
    Span<byte> p,
    int offset,
    int hStride,
    int vStride,
    int size,
    int thresh,
    int ithresh,
    int hevThresh)
{
    int edgeLimit = (2 * thresh) + 1;
    for (int i = 0; i < size; i++, offset += vStride)
    {
        if (!NeedsFilter2(p, offset, hStride, edgeLimit, ithresh))
        {
            continue;
        }

        if (Hev(p, offset, hStride, hevThresh))
        {
            DoFilter2(p, offset, hStride);
        }
        else
        {
            DoFilter4(p, offset, hStride);
        }
    }
}
/// <summary>
/// Visits <paramref name="size"/> positions and, wherever <c>NeedsFilter2</c> accepts the position,
/// applies <c>DoFilter2</c> if <c>Hev</c> reports high edge variance and <c>DoFilter6</c> otherwise.
/// </summary>
/// <param name="p">The pixel buffer that is filtered in place.</param>
/// <param name="offset">The first position.</param>
/// <param name="hStride">Distance between the samples examined at one position.</param>
/// <param name="vStride">Distance between two successive positions.</param>
/// <param name="size">Number of positions to visit.</param>
/// <param name="thresh">Edge threshold; (2 * thresh) + 1 is the limit handed to <c>NeedsFilter2</c>.</param>
/// <param name="ithresh">Interior threshold handed to <c>NeedsFilter2</c>.</param>
/// <param name="hevThresh">Threshold handed to <c>Hev</c>.</param>
private static void FilterLoop26(
    Span<byte> p,
    int offset,
    int hStride,
    int vStride,
    int size,
    int thresh,
    int ithresh,
    int hevThresh)
{
    int edgeLimit = (2 * thresh) + 1;
    for (int i = 0; i < size; i++, offset += vStride)
    {
        if (!NeedsFilter2(p, offset, hStride, edgeLimit, ithresh))
        {
            continue;
        }

        if (Hev(p, offset, hStride, hevThresh))
        {
            DoFilter2(p, offset, hStride);
        }
        else
        {
            DoFilter6(p, offset, hStride);
        }
    }
}
/// <summary>
/// Filters across an edge: 4 pixels in, 2 pixels out. Only the two samples next to the edge
/// (at offset - step and at offset) are rewritten.
/// </summary>
/// <param name="p">The pixel buffer that is filtered in place.</param>
/// <param name="offset">Position of the first sample after the edge.</param>
/// <param name="step">Distance between neighbouring samples across the edge.</param>
private static void DoFilter2(Span<byte> p, int offset, int step)
{
    int before = offset - step;
    int after = offset + step;

    // Load all four inputs before anything is written back.
    int p1 = p[before - step];
    int p0 = p[before];
    int q0 = p[offset];
    int q1 = p[after];

    int filter = (3 * (q0 - p0)) + WebpLookupTables.Sclip1(p1 - q1);
    int subtractFromQ0 = WebpLookupTables.Sclip2((filter + 4) >> 3);
    int addToP0 = WebpLookupTables.Sclip2((filter + 3) >> 3);

    p[before] = WebpLookupTables.Clip1(p0 + addToP0);
    p[offset] = WebpLookupTables.Clip1(q0 - subtractFromQ0);
}
/// <summary>
/// Filters across an edge: 4 pixels in, 4 pixels out. The two samples on each side of the edge
/// are rewritten.
/// </summary>
/// <param name="p">The pixel buffer that is filtered in place.</param>
/// <param name="offset">Position of the first sample after the edge.</param>
/// <param name="step">Distance between neighbouring samples across the edge.</param>
private static void DoFilter4(Span<byte> p, int offset, int step)
{
    int idxP1 = offset - (2 * step);
    int idxP0 = offset - step;
    int idxQ1 = offset + step;

    // Load all four inputs before anything is written back.
    int p1 = p[idxP1];
    int p0 = p[idxP0];
    int q0 = p[offset];
    int q1 = p[idxQ1];

    int filter = 3 * (q0 - p0);
    int inner = WebpLookupTables.Sclip2((filter + 4) >> 3);
    int innerRounded = WebpLookupTables.Sclip2((filter + 3) >> 3);
    int outer = (inner + 1) >> 1;

    p[idxP1] = WebpLookupTables.Clip1(p1 + outer);
    p[idxP0] = WebpLookupTables.Clip1(p0 + innerRounded);
    p[offset] = WebpLookupTables.Clip1(q0 - inner);
    p[idxQ1] = WebpLookupTables.Clip1(q1 - outer);
}
/// <summary>
/// Filters across an edge: 6 pixels in, 6 pixels out. The three samples on each side of the edge
/// are rewritten, with a correction that shrinks with the distance from the edge.
/// </summary>
/// <param name="p">The pixel buffer that is filtered in place.</param>
/// <param name="offset">Position of the first sample after the edge.</param>
/// <param name="step">Distance between neighbouring samples across the edge.</param>
private static void DoFilter6(Span<byte> p, int offset, int step)
{
    int twoSteps = 2 * step;
    int threeSteps = 3 * step;

    int idxP2 = offset - threeSteps;
    int idxP1 = offset - twoSteps;
    int idxP0 = offset - step;
    int idxQ1 = offset + step;
    int idxQ2 = offset + twoSteps;

    // Load all six inputs before anything is written back.
    int p2 = p[idxP2];
    int p1 = p[idxP1];
    int p0 = p[idxP0];
    int q0 = p[offset];
    int q1 = p[idxQ1];
    int q2 = p[idxQ2];

    int a = WebpLookupTables.Sclip1((3 * (q0 - p0)) + WebpLookupTables.Sclip1(p1 - q1));

    // a is in [-128,127], near in [-27,27], mid in [-18,18] and far in [-9,9]
    int near = ((27 * a) + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
    int mid = ((18 * a) + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
    int far = ((9 * a) + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7

    p[idxP2] = WebpLookupTables.Clip1(p2 + far);
    p[idxP1] = WebpLookupTables.Clip1(p1 + mid);
    p[idxP0] = WebpLookupTables.Clip1(p0 + near);
    p[offset] = WebpLookupTables.Clip1(q0 - near);
    p[idxQ1] = WebpLookupTables.Clip1(q1 - mid);
    p[idxQ2] = WebpLookupTables.Clip1(q2 - far);
}
/// <summary>
/// Tests whether the difference across an edge is within the limit <paramref name="t"/>.
/// </summary>
/// <param name="p">The pixel buffer.</param>
/// <param name="offset">Position of the first sample after the edge.</param>
/// <param name="step">Distance between neighbouring samples across the edge.</param>
/// <param name="t">The limit that 4 * |p0 - q0| + |p1 - q1| must not exceed.</param>
/// <returns>True if the weighted difference does not exceed <paramref name="t"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static bool NeedsFilter(Span<byte> p, int offset, int step, int t)
{
    int p1 = p[offset - (2 * step)];
    int p0 = p[offset - step];
    int q0 = p[offset];
    int q1 = p[offset + step];

    int edgeDelta = 4 * WebpLookupTables.Abs0(p0 - q0);
    int outerDelta = WebpLookupTables.Abs0(p1 - q1);
    return edgeDelta + outerDelta <= t;
}
/// <summary>
/// Tests whether an edge should be filtered: the weighted difference across the edge must not
/// exceed <paramref name="t"/>, and every difference between neighbouring samples on the same
/// side of the edge must not exceed <paramref name="it"/>.
/// </summary>
/// <param name="p">The pixel buffer.</param>
/// <param name="offset">Position of the first sample after the edge.</param>
/// <param name="step">Distance between neighbouring samples across the edge.</param>
/// <param name="t">The limit for 4 * |p0 - q0| + |p1 - q1|.</param>
/// <param name="it">The limit for the differences between neighbouring samples on one side.</param>
/// <returns>True if all conditions hold.</returns>
private static bool NeedsFilter2(Span<byte> p, int offset, int step, int t, int it)
{
    int twoSteps = 2 * step;
    int threeSteps = 3 * step;

    // All eight samples are loaded before any test is made.
    int p3 = p[offset - (4 * step)];
    int p2 = p[offset - threeSteps];
    int p1 = p[offset - twoSteps];
    int p0 = p[offset - step];
    int q0 = p[offset];
    int q1 = p[offset + step];
    int q2 = p[offset + twoSteps];
    int q3 = p[offset + threeSteps];

    int edgeDelta = (4 * WebpLookupTables.Abs0(p0 - q0)) + WebpLookupTables.Abs0(p1 - q1);
    if (edgeDelta > t)
    {
        return false;
    }

    if (WebpLookupTables.Abs0(p3 - p2) > it)
    {
        return false;
    }

    if (WebpLookupTables.Abs0(p2 - p1) > it)
    {
        return false;
    }

    if (WebpLookupTables.Abs0(p1 - p0) > it)
    {
        return false;
    }

    if (WebpLookupTables.Abs0(q3 - q2) > it)
    {
        return false;
    }

    if (WebpLookupTables.Abs0(q2 - q1) > it)
    {
        return false;
    }

    return WebpLookupTables.Abs0(q1 - q0) <= it;
}
/// <summary>
/// Tests whether the difference between the two samples on either side of an edge
/// exceeds <paramref name="thresh"/>.
/// </summary>
/// <param name="p">The pixel buffer.</param>
/// <param name="offset">Position of the first sample after the edge.</param>
/// <param name="step">Distance between neighbouring samples across the edge.</param>
/// <param name="thresh">The threshold.</param>
/// <returns>True if |p1 - p0| or |q1 - q0| is above the threshold.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static bool Hev(Span<byte> p, int offset, int step, int thresh)
{
    int p1 = p[offset - (2 * step)];
    int p0 = p[offset - step];
    int q0 = p[offset];
    int q1 = p[offset + step];

    if (WebpLookupTables.Abs0(p1 - p0) > thresh)
    {
        return true;
    }

    return WebpLookupTables.Abs0(q1 - q0) > thresh;
}
/// <summary>
/// Adds <paramref name="v"/> shifted right by 3 to the pixel at column <paramref name="x"/> of row
/// <paramref name="y"/> and stores the result clamped to the byte range.
/// </summary>
/// <param name="dst">The destination buffer; rows are <c>WebpConstants.Bps</c> bytes apart.</param>
/// <param name="x">The column.</param>
/// <param name="y">The row.</param>
/// <param name="v">The value to add, scaled by 8.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Store(Span<byte> dst, int x, int y, int v)
{
    int pos = (y * WebpConstants.Bps) + x;
    int updated = dst[pos] + (v >> 3);
    dst[pos] = Clip8B(updated);
}
/// <summary>
/// Updates the four pixels of row <paramref name="y"/> through <c>Store</c> with the values
/// dc + d, dc + c, dc - c and dc - d (columns 0 to 3).
/// </summary>
/// <param name="dst">The destination buffer.</param>
/// <param name="y">The row.</param>
/// <param name="dc">The common term.</param>
/// <param name="d">The term added in column 0 and subtracted in column 3.</param>
/// <param name="c">The term added in column 1 and subtracted in column 2.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Store2(Span<byte> dst, int y, int dc, int d, int c)
{
    int col0 = dc + d;
    int col1 = dc + c;
    int col2 = dc - c;
    int col3 = dc - d;

    Store(dst, 0, y, col0);
    Store(dst, 1, y, col1);
    Store(dst, 2, y, col2);
    Store(dst, 3, y, col3);
}
/// <summary>
/// Fixed-point multiplication: returns a + ((a * 20091) >> 16).
/// </summary>
/// <param name="a">The value to scale.</param>
/// <returns>The scaled value.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Mul1(int a)
{
    const int Factor = 20091;
    int fraction = (a * Factor) >> 16;
    return fraction + a;
}
/// <summary>
/// Fixed-point multiplication: returns (a * 35468) >> 16.
/// </summary>
/// <param name="a">The value to scale.</param>
/// <returns>The scaled value.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Mul2(int a)
{
    const int Factor = 35468;
    int product = a * Factor;
    return product >> 16;
}
/// <summary>
/// Fills an 8x8 area with a single value; rows are <c>WebpConstants.Bps</c> bytes apart.
/// </summary>
/// <param name="value">The fill value.</param>
/// <param name="dst">The destination buffer, starting at the top-left corner of the area.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Put8x8uv(byte value, Span<byte> dst)
{
    for (int row = 0; row < 8; row++)
    {
        // Equivalent of memset(dst + row * BPS, value, 8).
        Memset(dst, value, row * WebpConstants.Bps, 8);
    }
}
/// <summary>
/// Sets <paramref name="count"/> bytes of <paramref name="dst"/>, starting at
/// <paramref name="startIdx"/>, to <paramref name="value"/>.
/// </summary>
/// <param name="dst">The buffer to fill.</param>
/// <param name="value">The fill value.</param>
/// <param name="startIdx">Index of the first byte to set.</param>
/// <param name="count">Number of bytes to set; nothing is written when it is zero or negative.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Memset(Span<byte> dst, byte value, int startIdx, int count)
{
    // A non-positive count fills nothing; Slice would reject a negative length.
    if (count <= 0)
    {
        return;
    }

    // Span<T>.Fill replaces the byte-by-byte loop and validates the range once.
    dst.Slice(startIdx, count).Fill(value);
}
/// <summary>
/// Clamps an integer to the range [0, 255].
/// </summary>
/// <param name="x">The value to clamp.</param>
/// <returns>0 for negative input, 255 for input above 255, otherwise the input itself.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Clamp255(int x)
{
    return Math.Max(0, Math.Min(255, x));
}
}
}