|
|
|
@ -1035,7 +1035,7 @@ internal static class LossyUtils |
|
|
|
// Does two inverse transforms.
|
|
|
|
public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scratch) |
|
|
|
{ |
|
|
|
if (Sse2.IsSupported) |
|
|
|
if (Vector128.IsHardwareAccelerated) |
|
|
|
{ |
|
|
|
// This implementation makes use of 16-bit fixed point versions of two
|
|
|
|
// multiply constants:
|
|
|
|
@ -1083,64 +1083,64 @@ internal static class LossyUtils |
|
|
|
|
|
|
|
// Vertical pass and subsequent transpose.
|
|
|
|
// First pass, c and d calculations are longer because of the "trick" multiplications.
|
|
|
|
Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); |
|
|
|
Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); |
|
|
|
Vector128<short> a = in0.AsInt16() + in2.AsInt16(); |
|
|
|
Vector128<short> b = in0.AsInt16() - in2.AsInt16(); |
|
|
|
|
|
|
|
Vector128<short> k1 = Vector128.Create((short)20091); |
|
|
|
Vector128<short> k2 = Vector128.Create((short)-30068); |
|
|
|
|
|
|
|
// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
|
|
|
|
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); |
|
|
|
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1); |
|
|
|
Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); |
|
|
|
Vector128<short> c4 = Sse2.Subtract(c1, c2); |
|
|
|
Vector128<short> c = Sse2.Add(c3.AsInt16(), c4); |
|
|
|
Vector128<short> c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2); |
|
|
|
Vector128<short> c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1); |
|
|
|
Vector128<short> c3 = in1.AsInt16() - in3.AsInt16(); |
|
|
|
Vector128<short> c4 = c1 - c2; |
|
|
|
Vector128<short> c = c3.AsInt16() + c4; |
|
|
|
|
|
|
|
// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
|
|
|
|
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1); |
|
|
|
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2); |
|
|
|
Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); |
|
|
|
Vector128<short> d4 = Sse2.Add(d1, d2); |
|
|
|
Vector128<short> d = Sse2.Add(d3, d4); |
|
|
|
Vector128<short> d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1); |
|
|
|
Vector128<short> d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2); |
|
|
|
Vector128<short> d3 = in1.AsInt16() + in3.AsInt16(); |
|
|
|
Vector128<short> d4 = d1 + d2; |
|
|
|
Vector128<short> d = d3 + d4; |
|
|
|
|
|
|
|
// Second pass.
|
|
|
|
Vector128<short> tmp0 = Sse2.Add(a.AsInt16(), d); |
|
|
|
Vector128<short> tmp1 = Sse2.Add(b.AsInt16(), c); |
|
|
|
Vector128<short> tmp2 = Sse2.Subtract(b.AsInt16(), c); |
|
|
|
Vector128<short> tmp3 = Sse2.Subtract(a.AsInt16(), d); |
|
|
|
Vector128<short> tmp0 = a.AsInt16() + d; |
|
|
|
Vector128<short> tmp1 = b.AsInt16() + c; |
|
|
|
Vector128<short> tmp2 = b.AsInt16() - c; |
|
|
|
Vector128<short> tmp3 = a.AsInt16() - d; |
|
|
|
|
|
|
|
// Transpose the two 4x4 blocks.
|
|
|
|
Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3); |
|
|
|
|
|
|
|
// Horizontal pass and subsequent transpose.
|
|
|
|
// First pass, c and d calculations are longer because of the "trick" multiplications.
|
|
|
|
Vector128<short> dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4)); |
|
|
|
a = Sse2.Add(dc, t2.AsInt16()); |
|
|
|
b = Sse2.Subtract(dc, t2.AsInt16()); |
|
|
|
Vector128<short> dc = t0.AsInt16() + Vector128.Create((short)4); |
|
|
|
a = dc + t2.AsInt16(); |
|
|
|
b = dc - t2.AsInt16(); |
|
|
|
|
|
|
|
// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
|
|
|
|
c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2); |
|
|
|
c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1); |
|
|
|
c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); |
|
|
|
c4 = Sse2.Subtract(c1, c2); |
|
|
|
c = Sse2.Add(c3, c4); |
|
|
|
c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2); |
|
|
|
c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1); |
|
|
|
c3 = t1.AsInt16() - t3.AsInt16(); |
|
|
|
c4 = c1 - c2; |
|
|
|
c = c3 + c4; |
|
|
|
|
|
|
|
// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
|
|
|
|
d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1); |
|
|
|
d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2); |
|
|
|
d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); |
|
|
|
d4 = Sse2.Add(d1, d2); |
|
|
|
d = Sse2.Add(d3, d4); |
|
|
|
d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1); |
|
|
|
d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2); |
|
|
|
d3 = t1.AsInt16() + t3.AsInt16(); |
|
|
|
d4 = d1 + d2; |
|
|
|
d = d3 + d4; |
|
|
|
|
|
|
|
// Second pass.
|
|
|
|
tmp0 = Sse2.Add(a, d); |
|
|
|
tmp1 = Sse2.Add(b, c); |
|
|
|
tmp2 = Sse2.Subtract(b, c); |
|
|
|
tmp3 = Sse2.Subtract(a, d); |
|
|
|
Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); |
|
|
|
Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); |
|
|
|
Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); |
|
|
|
Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); |
|
|
|
tmp0 = a + d; |
|
|
|
tmp1 = b + c; |
|
|
|
tmp2 = b - c; |
|
|
|
tmp3 = a - d; |
|
|
|
Vector128<short> shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3); |
|
|
|
Vector128<short> shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3); |
|
|
|
Vector128<short> shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3); |
|
|
|
Vector128<short> shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); |
|
|
|
|
|
|
|
// Transpose the two 4x4 blocks.
|
|
|
|
Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); |
|
|
|
@ -1155,22 +1155,22 @@ internal static class LossyUtils |
|
|
|
Vector128<byte> dst3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3)), 0).AsByte(); |
|
|
|
|
|
|
|
// Convert to 16b.
|
|
|
|
dst0 = Sse2.UnpackLow(dst0, Vector128<byte>.Zero); |
|
|
|
dst1 = Sse2.UnpackLow(dst1, Vector128<byte>.Zero); |
|
|
|
dst2 = Sse2.UnpackLow(dst2, Vector128<byte>.Zero); |
|
|
|
dst3 = Sse2.UnpackLow(dst3, Vector128<byte>.Zero); |
|
|
|
dst0 = Vector128_.UnpackLow(dst0, Vector128<byte>.Zero); |
|
|
|
dst1 = Vector128_.UnpackLow(dst1, Vector128<byte>.Zero); |
|
|
|
dst2 = Vector128_.UnpackLow(dst2, Vector128<byte>.Zero); |
|
|
|
dst3 = Vector128_.UnpackLow(dst3, Vector128<byte>.Zero); |
|
|
|
|
|
|
|
// Add the inverse transform(s).
|
|
|
|
dst0 = Sse2.Add(dst0.AsInt16(), t0.AsInt16()).AsByte(); |
|
|
|
dst1 = Sse2.Add(dst1.AsInt16(), t1.AsInt16()).AsByte(); |
|
|
|
dst2 = Sse2.Add(dst2.AsInt16(), t2.AsInt16()).AsByte(); |
|
|
|
dst3 = Sse2.Add(dst3.AsInt16(), t3.AsInt16()).AsByte(); |
|
|
|
dst0 = (dst0.AsInt16() + t0.AsInt16()).AsByte(); |
|
|
|
dst1 = (dst1.AsInt16() + t1.AsInt16()).AsByte(); |
|
|
|
dst2 = (dst2.AsInt16() + t2.AsInt16()).AsByte(); |
|
|
|
dst3 = (dst3.AsInt16() + t3.AsInt16()).AsByte(); |
|
|
|
|
|
|
|
// Unsigned saturate to 8b.
|
|
|
|
dst0 = Sse2.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); |
|
|
|
dst1 = Sse2.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); |
|
|
|
dst2 = Sse2.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); |
|
|
|
dst3 = Sse2.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); |
|
|
|
dst0 = Vector128_.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); |
|
|
|
dst1 = Vector128_.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); |
|
|
|
dst2 = Vector128_.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); |
|
|
|
dst3 = Vector128_.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); |
|
|
|
|
|
|
|
// Store the results.
|
|
|
|
// Store eight bytes/pixels per line.
|
|
|
|
|