|
|
|
@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
/// <summary>
|
|
|
|
/// Methods for encoding a VP8 frame.
|
|
|
|
/// </summary>
|
|
|
|
internal static class Vp8Encoding |
|
|
|
internal static unsafe class Vp8Encoding |
|
|
|
{ |
|
|
|
private const int KC1 = 20091 + (1 << 16); |
|
|
|
|
|
|
|
@ -66,11 +66,39 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 }; |
|
|
|
|
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
public static readonly Vector128<short> K1 = Vector128.Create((short)20091).AsInt16(); |
|
|
|
#pragma warning disable SA1310 // Field names should not contain underscore
|
|
|
|
private static readonly Vector128<short> K1 = Vector128.Create((short)20091).AsInt16(); |
|
|
|
|
|
|
|
public static readonly Vector128<short> K2 = Vector128.Create((short)-30068).AsInt16(); |
|
|
|
private static readonly Vector128<short> K2 = Vector128.Create((short)-30068).AsInt16(); |
|
|
|
|
|
|
|
public static readonly Vector128<short> Four = Vector128.Create((short)4); |
|
|
|
private static readonly Vector128<short> Four = Vector128.Create((short)4); |
|
|
|
|
|
|
|
private static readonly Vector128<short> Seven = Vector128.Create((short)7); |
|
|
|
|
|
|
|
private static readonly Vector128<short> K88p = Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16(); |
|
|
|
|
|
|
|
private static readonly Vector128<short> K88m = Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16(); |
|
|
|
|
|
|
|
private static readonly Vector128<short> K5352_2217p = Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16(); |
|
|
|
|
|
|
|
private static readonly Vector128<short> K5352_2217m = Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16(); |
|
|
|
|
|
|
|
private static readonly Vector128<int> K937 = Vector128.Create(937); |
|
|
|
|
|
|
|
private static readonly Vector128<int> K1812 = Vector128.Create(1812); |
|
|
|
|
|
|
|
private static readonly Vector128<short> K5352_2217 = Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16(); |
|
|
|
|
|
|
|
private static readonly Vector128<short> K2217_5352 = Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16(); |
|
|
|
|
|
|
|
private static readonly Vector128<int> K12000PlusOne = Vector128.Create(12000 + (1 << 16)); |
|
|
|
|
|
|
|
private static readonly Vector128<int> K51000 = Vector128.Create(51000); |
|
|
|
|
|
|
|
private static readonly byte MmShuffle2301 = SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1); |
|
|
|
|
|
|
|
private static readonly byte MmShuffle1032 = SimdUtils.Shuffle.MmShuffle(1, 0, 3, 2); |
|
|
|
#pragma warning restore SA1310 // Field names should not contain underscore
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static Vp8Encoding() |
|
|
|
@ -376,49 +404,246 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
|
|
|
|
public static void FTransform2(Span<byte> src, Span<byte> reference, Span<short> output, Span<short> output2, Span<int> scratch) |
|
|
|
{ |
|
|
|
FTransform(src, reference, output, scratch); |
|
|
|
FTransform(src.Slice(4), reference.Slice(4), output2, scratch); |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
if (Sse2.IsSupported) |
|
|
|
{ |
|
|
|
ref byte srcRef = ref MemoryMarshal.GetReference(src); |
|
|
|
ref byte referenceRef = ref MemoryMarshal.GetReference(reference); |
|
|
|
|
|
|
|
// Load src.
|
|
|
|
var src0 = Vector128.Create(Unsafe.As<byte, long>(ref srcRef), 0); |
|
|
|
var src1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps)), 0); |
|
|
|
var src2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 2)), 0); |
|
|
|
var src3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 3)), 0); |
|
|
|
|
|
|
|
// Load ref.
|
|
|
|
var ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0); |
|
|
|
var ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0); |
|
|
|
var ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0); |
|
|
|
var ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0); |
|
|
|
|
|
|
|
// Convert both to 16 bit.
|
|
|
|
Vector128<byte> srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128<byte>.Zero); |
|
|
|
Vector128<byte> srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128<byte>.Zero); |
|
|
|
Vector128<byte> srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128<byte>.Zero); |
|
|
|
Vector128<byte> srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128<byte>.Zero); |
|
|
|
Vector128<byte> refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero); |
|
|
|
Vector128<byte> refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero); |
|
|
|
Vector128<byte> refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero); |
|
|
|
Vector128<byte> refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero); |
|
|
|
|
|
|
|
// Compute difference. -> 00 01 02 03 00' 01' 02' 03'
|
|
|
|
Vector128<short> diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16()); |
|
|
|
Vector128<short> diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16()); |
|
|
|
Vector128<short> diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16()); |
|
|
|
Vector128<short> diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16()); |
|
|
|
|
|
|
|
// Unpack and shuffle.
|
|
|
|
// 00 01 02 03 0 0 0 0
|
|
|
|
// 10 11 12 13 0 0 0 0
|
|
|
|
// 20 21 22 23 0 0 0 0
|
|
|
|
// 30 31 32 33 0 0 0 0
|
|
|
|
Vector128<int> shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32()); |
|
|
|
Vector128<int> shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32()); |
|
|
|
Vector128<int> shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32()); |
|
|
|
Vector128<int> shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32()); |
|
|
|
|
|
|
|
// First pass.
|
|
|
|
FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l); |
|
|
|
FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h); |
|
|
|
|
|
|
|
// Second pass.
|
|
|
|
FTransformPass2SSE2(v01l, v32l, output); |
|
|
|
FTransformPass2SSE2(v01h, v32h, output2); |
|
|
|
} |
|
|
|
else |
|
|
|
#endif
|
|
|
|
{ |
|
|
|
FTransform(src, reference, output, scratch); |
|
|
|
FTransform(src.Slice(4), reference.Slice(4), output2, scratch); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
public static void FTransform(Span<byte> src, Span<byte> reference, Span<short> output, Span<int> scratch) |
|
|
|
{ |
|
|
|
int i; |
|
|
|
Span<int> tmp = scratch.Slice(0, 16); |
|
|
|
|
|
|
|
int srcIdx = 0; |
|
|
|
int refIdx = 0; |
|
|
|
for (i = 0; i < 4; i++) |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
if (Sse2.IsSupported) |
|
|
|
{ |
|
|
|
int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255])
|
|
|
|
int d1 = src[srcIdx + 1] - reference[refIdx + 1]; |
|
|
|
int d2 = src[srcIdx + 2] - reference[refIdx + 2]; |
|
|
|
int d3 = src[srcIdx + 3] - reference[refIdx + 3]; |
|
|
|
int a0 = d0 + d3; // 10b [-510,510]
|
|
|
|
int a1 = d1 + d2; |
|
|
|
int a2 = d1 - d2; |
|
|
|
int a3 = d0 - d3; |
|
|
|
tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160]
|
|
|
|
tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542]
|
|
|
|
tmp[2 + (i * 4)] = (a0 - a1) * 8; |
|
|
|
tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9; |
|
|
|
|
|
|
|
srcIdx += WebpConstants.Bps; |
|
|
|
refIdx += WebpConstants.Bps; |
|
|
|
} |
|
|
|
ref byte srcRef = ref MemoryMarshal.GetReference(src); |
|
|
|
ref byte referenceRef = ref MemoryMarshal.GetReference(reference); |
|
|
|
|
|
|
|
for (i = 0; i < 4; i++) |
|
|
|
// Load src.
|
|
|
|
var src0 = Vector128.Create(Unsafe.As<byte, long>(ref srcRef), 0); |
|
|
|
var src1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps)), 0); |
|
|
|
var src2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 2)), 0); |
|
|
|
var src3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 3)), 0); |
|
|
|
|
|
|
|
// Load ref.
|
|
|
|
var ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0); |
|
|
|
var ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0); |
|
|
|
var ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0); |
|
|
|
var ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0); |
|
|
|
|
|
|
|
// 00 01 02 03 *
|
|
|
|
// 10 11 12 13 *
|
|
|
|
// 20 21 22 23 *
|
|
|
|
// 30 31 32 33 *
|
|
|
|
// Shuffle.
|
|
|
|
Vector128<short> srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16()); |
|
|
|
Vector128<short> srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16()); |
|
|
|
Vector128<short> refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16()); |
|
|
|
Vector128<short> refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16()); |
|
|
|
|
|
|
|
// 00 01 10 11 02 03 12 13 * * ...
|
|
|
|
// 20 21 30 31 22 22 32 33 * * ...
|
|
|
|
|
|
|
|
// Convert both to 16 bit.
|
|
|
|
Vector128<byte> src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero); |
|
|
|
Vector128<byte> src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero); |
|
|
|
Vector128<byte> ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero); |
|
|
|
Vector128<byte> ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero); |
|
|
|
|
|
|
|
// Compute the difference.
|
|
|
|
Vector128<short> row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16()); |
|
|
|
Vector128<short> row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16()); |
|
|
|
|
|
|
|
// First pass.
|
|
|
|
FTransformPass1SSE2(row01, row23, out Vector128<int> v01, out Vector128<int> v32); |
|
|
|
|
|
|
|
// Second pass.
|
|
|
|
FTransformPass2SSE2(v01, v32, output); |
|
|
|
} |
|
|
|
else |
|
|
|
#endif
|
|
|
|
{ |
|
|
|
int a0 = tmp[0 + i] + tmp[12 + i]; // 15b
|
|
|
|
int a1 = tmp[4 + i] + tmp[8 + i]; |
|
|
|
int a2 = tmp[4 + i] - tmp[8 + i]; |
|
|
|
int a3 = tmp[0 + i] - tmp[12 + i]; |
|
|
|
output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b
|
|
|
|
output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0)); |
|
|
|
output[8 + i] = (short)((a0 - a1 + 7) >> 4); |
|
|
|
output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16); |
|
|
|
int i; |
|
|
|
Span<int> tmp = scratch.Slice(0, 16); |
|
|
|
|
|
|
|
int srcIdx = 0; |
|
|
|
int refIdx = 0; |
|
|
|
for (i = 0; i < 4; i++) |
|
|
|
{ |
|
|
|
int d3 = src[srcIdx + 3] - reference[refIdx + 3]; |
|
|
|
int d2 = src[srcIdx + 2] - reference[refIdx + 2]; |
|
|
|
int d1 = src[srcIdx + 1] - reference[refIdx + 1]; |
|
|
|
int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255])
|
|
|
|
int a0 = d0 + d3; // 10b [-510,510]
|
|
|
|
int a1 = d1 + d2; |
|
|
|
int a2 = d1 - d2; |
|
|
|
int a3 = d0 - d3; |
|
|
|
tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9; |
|
|
|
tmp[2 + (i * 4)] = (a0 - a1) * 8; |
|
|
|
tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542]
|
|
|
|
tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160]
|
|
|
|
|
|
|
|
srcIdx += WebpConstants.Bps; |
|
|
|
refIdx += WebpConstants.Bps; |
|
|
|
} |
|
|
|
|
|
|
|
for (i = 0; i < 4; i++) |
|
|
|
{ |
|
|
|
int t12 = tmp[12 + i]; // 15b
|
|
|
|
int t8 = tmp[8 + i]; |
|
|
|
|
|
|
|
int a1 = tmp[4 + i] + t8; |
|
|
|
int a2 = tmp[4 + i] - t8; |
|
|
|
int a0 = tmp[0 + i] + t12; // 15b
|
|
|
|
int a3 = tmp[0 + i] - t12; |
|
|
|
|
|
|
|
output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16); |
|
|
|
output[8 + i] = (short)((a0 - a1 + 7) >> 4); |
|
|
|
output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0)); |
|
|
|
output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
public static void FTransformPass1SSE2(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32) |
|
|
|
{ |
|
|
|
// *in01 = 00 01 10 11 02 03 12 13
|
|
|
|
// *in23 = 20 21 30 31 22 23 32 33
|
|
|
|
Vector128<short> shuf01_p = Sse2.ShuffleHigh(row01, MmShuffle2301); |
|
|
|
Vector128<short> shuf32_p = Sse2.ShuffleHigh(row23, MmShuffle2301); |
|
|
|
|
|
|
|
// 00 01 10 11 03 02 13 12
|
|
|
|
// 20 21 30 31 23 22 33 32
|
|
|
|
Vector128<long> s01 = Sse2.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64()); |
|
|
|
Vector128<long> s32 = Sse2.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64()); |
|
|
|
|
|
|
|
// 00 01 10 11 20 21 30 31
|
|
|
|
// 03 02 13 12 23 22 33 32
|
|
|
|
Vector128<short> a01 = Sse2.Add(s01.AsInt16(), s32.AsInt16()); |
|
|
|
Vector128<short> a32 = Sse2.Subtract(s01.AsInt16(), s32.AsInt16()); |
|
|
|
|
|
|
|
// [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
|
|
|
|
// [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
|
|
|
|
Vector128<int> tmp0 = Sse2.MultiplyAddAdjacent(a01, K88p); // [ (a0 + a1) << 3, ... ]
|
|
|
|
Vector128<int> tmp2 = Sse2.MultiplyAddAdjacent(a01, K88m); // [ (a0 - a1) << 3, ... ]
|
|
|
|
Vector128<int> tmp11 = Sse2.MultiplyAddAdjacent(a32, K5352_2217p); |
|
|
|
Vector128<int> tmp31 = Sse2.MultiplyAddAdjacent(a32, K5352_2217m); |
|
|
|
Vector128<int> tmp12 = Sse2.Add(tmp11, K1812); |
|
|
|
Vector128<int> tmp32 = Sse2.Add(tmp31, K937); |
|
|
|
Vector128<int> tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9); |
|
|
|
Vector128<int> tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9); |
|
|
|
Vector128<short> s03 = Sse2.PackSignedSaturate(tmp0, tmp2); |
|
|
|
Vector128<short> s12 = Sse2.PackSignedSaturate(tmp1, tmp3); |
|
|
|
Vector128<short> slo = Sse2.UnpackLow(s03, s12); // 0 1 0 1 0 1...
|
|
|
|
Vector128<short> shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3
|
|
|
|
Vector128<int> v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32()); |
|
|
|
out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32()); |
|
|
|
out32 = Sse2.Shuffle(v23, MmShuffle1032); |
|
|
|
} |
|
|
|
|
|
|
|
public static void FTransformPass2SSE2(Vector128<int> v01, Vector128<int> v32, Span<short> output) |
|
|
|
{ |
|
|
|
// Same operations are done on the (0,3) and (1,2) pairs.
|
|
|
|
// a3 = v0 - v3
|
|
|
|
// a2 = v1 - v2
|
|
|
|
Vector128<short> a32 = Sse2.Subtract(v01.AsInt16(), v32.AsInt16()); |
|
|
|
Vector128<long> a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64()); |
|
|
|
|
|
|
|
Vector128<short> b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16()); |
|
|
|
Vector128<int> c1 = Sse2.MultiplyAddAdjacent(b23, K5352_2217); |
|
|
|
Vector128<int> c3 = Sse2.MultiplyAddAdjacent(b23, K2217_5352); |
|
|
|
Vector128<int> d1 = Sse2.Add(c1, K12000PlusOne); |
|
|
|
Vector128<int> d3 = Sse2.Add(c3, K51000); |
|
|
|
Vector128<int> e1 = Sse2.ShiftRightArithmetic(d1, 16); |
|
|
|
Vector128<int> e3 = Sse2.ShiftRightArithmetic(d3, 16); |
|
|
|
|
|
|
|
// f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
|
|
|
|
// f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
|
|
|
|
Vector128<short> f1 = Sse2.PackSignedSaturate(e1, e1); |
|
|
|
Vector128<short> f3 = Sse2.PackSignedSaturate(e3, e3); |
|
|
|
|
|
|
|
// g1 = f1 + (a3 != 0);
|
|
|
|
// The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
|
|
|
|
// desired (0, 1), we add one earlier through k12000_plus_one.
|
|
|
|
// -> g1 = f1 + 1 - (a3 == 0)
|
|
|
|
Vector128<short> g1 = Sse2.Add(f1, Sse2.CompareEqual(a32, Vector128<short>.Zero)); |
|
|
|
|
|
|
|
// a0 = v0 + v3
|
|
|
|
// a1 = v1 + v2
|
|
|
|
Vector128<short> a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16()); |
|
|
|
Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), Seven); |
|
|
|
Vector128<short> a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16(); |
|
|
|
Vector128<short> c0 = Sse2.Add(a01Plus7, a11); |
|
|
|
Vector128<short> c2 = Sse2.Subtract(a01Plus7, a11); |
|
|
|
|
|
|
|
// d0 = (a0 + a1 + 7) >> 4;
|
|
|
|
// d2 = (a0 - a1 + 7) >> 4;
|
|
|
|
Vector128<short> d0 = Sse2.ShiftRightArithmetic(c0, 4); |
|
|
|
Vector128<short> d2 = Sse2.ShiftRightArithmetic(c2, 4); |
|
|
|
|
|
|
|
Vector128<long> d0g1 = Sse2.UnpackLow(d0.AsInt64(), g1.AsInt64()); |
|
|
|
Vector128<long> d2f3 = Sse2.UnpackLow(d2.AsInt64(), f3.AsInt64()); |
|
|
|
|
|
|
|
ref short outputRef = ref MemoryMarshal.GetReference(output); |
|
|
|
Unsafe.As<short, Vector128<short>>(ref outputRef) = d0g1.AsInt16(); |
|
|
|
Unsafe.As<short, Vector128<short>>(ref Unsafe.Add(ref outputRef, 8)) = d2f3.AsInt16(); |
|
|
|
} |
|
|
|
#endif
|
|
|
|
|
|
|
|
public static void FTransformWht(Span<short> input, Span<short> output, Span<int> scratch) |
|
|
|
{ |
|
|
|
Span<int> tmp = scratch.Slice(0, 16); |
|
|
|
@ -427,32 +652,37 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy |
|
|
|
int inputIdx = 0; |
|
|
|
for (i = 0; i < 4; i++) |
|
|
|
{ |
|
|
|
int a0 = input[inputIdx + (0 * 16)] + input[inputIdx + (2 * 16)]; // 13b
|
|
|
|
int a1 = input[inputIdx + (1 * 16)] + input[inputIdx + (3 * 16)]; |
|
|
|
int a2 = input[inputIdx + (1 * 16)] - input[inputIdx + (3 * 16)]; |
|
|
|
int a0 = input[inputIdx + (0 * 16)] + input[inputIdx + (2 * 16)]; // 13b
|
|
|
|
int a3 = input[inputIdx + (0 * 16)] - input[inputIdx + (2 * 16)]; |
|
|
|
tmp[0 + (i * 4)] = a0 + a1; // 14b
|
|
|
|
tmp[1 + (i * 4)] = a3 + a2; |
|
|
|
tmp[2 + (i * 4)] = a3 - a2; |
|
|
|
tmp[3 + (i * 4)] = a0 - a1; |
|
|
|
tmp[2 + (i * 4)] = a3 - a2; |
|
|
|
tmp[1 + (i * 4)] = a3 + a2; |
|
|
|
tmp[0 + (i * 4)] = a0 + a1; // 14b
|
|
|
|
|
|
|
|
inputIdx += 64; |
|
|
|
} |
|
|
|
|
|
|
|
for (i = 0; i < 4; i++) |
|
|
|
{ |
|
|
|
int a0 = tmp[0 + i] + tmp[8 + i]; // 15b
|
|
|
|
int a1 = tmp[4 + i] + tmp[12 + i]; |
|
|
|
int a2 = tmp[4 + i] - tmp[12 + i]; |
|
|
|
int a3 = tmp[0 + i] - tmp[8 + i]; |
|
|
|
int t12 = tmp[12 + i]; |
|
|
|
int t8 = tmp[8 + i]; |
|
|
|
|
|
|
|
int a1 = tmp[4 + i] + t12; |
|
|
|
int a2 = tmp[4 + i] - t12; |
|
|
|
int a0 = tmp[0 + i] + t8; // 15b
|
|
|
|
int a3 = tmp[0 + i] - t8; |
|
|
|
|
|
|
|
int b0 = a0 + a1; // 16b
|
|
|
|
int b1 = a3 + a2; |
|
|
|
int b2 = a3 - a2; |
|
|
|
int b3 = a0 - a1; |
|
|
|
output[0 + i] = (short)(b0 >> 1); // 15b
|
|
|
|
output[4 + i] = (short)(b1 >> 1); |
|
|
|
output[8 + i] = (short)(b2 >> 1); |
|
|
|
|
|
|
|
output[12 + i] = (short)(b3 >> 1); |
|
|
|
output[8 + i] = (short)(b2 >> 1); |
|
|
|
output[4 + i] = (short)(b1 >> 1); |
|
|
|
output[0 + i] = (short)(b0 >> 1); // 15b
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|