diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 3064ccc030..cfac273c49 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -661,28 +661,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - Vector128 transpose00 = Sse2.UnpackLow(b0, b1); - Vector128 transpose01 = Sse2.UnpackLow(b2, b3); - Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); - Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); - - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); - Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); - - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - Vector128 output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); - Vector128 output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); - Vector128 output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); - Vector128 output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -728,6 +707,43 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Unsafe.As>(ref outputRef) = result.AsInt32(); return sum[3] + sum[2] + sum[1] + sum[0]; } + + // Transpose two 4x4 16b matrices horizontally stored in registers. + public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) + { + // Transpose the two 4x4. + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + Vector128 transpose00 = Sse2.UnpackLow(b0, b1); + Vector128 transpose01 = Sse2.UnpackLow(b2, b3); + Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); + Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); + + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); + + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 a22 b32 b03 b13 b23 b33 + output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); + output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); + output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); + output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } #endif public static void TransformTwo(Span src, Span dst, Span scratch) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 0567a0f27d..cb149bec7f 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -4,6 +4,11 @@ using System; using System.Buffers.Binary; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Formats.Webp.Lossy { @@ -60,6 +65,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 }; +#if SUPPORTS_RUNTIME_INTRINSICS + public static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16(); + + public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16(); +#endif + static Vp8Encoding() { for (int i = -255; i <= 255 + 255; i++) @@ -68,12 +79,199 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } + // Transforms (Paragraph 14.4) + // Does one or two inverse transforms. public static void ITransform(Span reference, Span input, Span dst, bool doTwo, Span scratch) { - ITransformOne(reference, input, dst, scratch); - if (doTwo) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); + // This implementation makes use of 16-bit fixed point versions of two + // multiply constants: + // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 + // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 + // + // To be able to use signed 16-bit integers, we use the following trick to + // have constants within range: + // - Associated constants are obtained by subtracting the 16-bit fixed point + // version of one: + // k = K - (1 << 16) => K = k + (1 << 16) + // K1 = 85267 => k1 = 20091 + // K2 = 35468 => k2 = -30068 + // - The multiplication of a variable by a constant become the sum of the + // variable and the multiplication of that variable by the associated + // constant: + // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x + + // Load and concatenate the transform coefficients (we'll do two inverse + // transforms in parallel). In the case of only one inverse transform, the + // second half of the vectors will just contain random value we'll never + // use nor store. + ref short inputRef = ref MemoryMarshal.GetReference(input); + var in0 = Vector128.Create(Unsafe.As(ref inputRef), 0); + var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 4)), 0); + var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 8)), 0); + var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 12)), 0); + + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + if (doTwo) + { + var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 16)), 0); + var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); + var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); + var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); + + in0 = Sse2.UnpackLow(in0, inb0); + in1 = Sse2.UnpackLow(in1, inb1); + in2 = Sse2.UnpackLow(in2, inb2); + in3 = Sse2.UnpackLow(in3, inb3); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Vertical pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); + Vector128 c3 = Sse2.Subtract(in1, in3); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); + Vector128 d3 = Sse2.Add(in1, in3); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); + + // Second pass. + Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + + // Horizontal pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + var four = Vector128.Create((short)4); + Vector128 dc = Sse2.Add(t0.AsInt16(), four); + a = Sse2.Add(dc, t2.AsInt16()); + b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + c3 = Sse2.Subtract(t1, t3); + c4 = Sse2.Subtract(c1, c2); + c = Sse2.Add(c3.AsInt16(), c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + d3 = Sse2.Add(t1, t3); + d4 = Sse2.Add(d1, d2); + d = Sse2.Add(d3.AsInt16(), d4); + + // Second pass. + tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + + // Add inverse transform to 'ref' and store. + // Load the reference(s). + Vector128 ref0 = Vector128.Zero; + Vector128 ref1 = Vector128.Zero; + Vector128 ref2 = Vector128.Zero; + Vector128 ref3 = Vector128.Zero; + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + if (doTwo) + { + // Load eight bytes/pixels per line. + ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); + ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); + ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); + ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); + } + else + { + // Load four bytes/pixels per line. + ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); + ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + } + + // Convert to 16b. + ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); + ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); + ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); + ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + + // Add the inverse transform(s). + Vector128 ref0InvAdded = Sse2.Add(ref0.AsUInt16(), t0.AsUInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsUInt16(), t1.AsUInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsUInt16(), t2.AsUInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsUInt16(), t3.AsUInt16()); + + // Unsigned saturate to 8b. + ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded.AsInt16(), ref1InvAdded.AsInt16()); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded.AsInt16(), ref2InvAdded.AsInt16()); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded.AsInt16(), ref3InvAdded.AsInt16()); + + // Unsigned saturate to 8b. + if (doTwo) + { + // Store eight bytes/pixels per line. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + Unsafe.As>(ref outputRef) = ref0; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3; + } + else + { + // Store four bytes/pixels per line. + int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); + int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); + int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); + int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); + + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + Unsafe.As(ref outputRef) = output0; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; + } + } + else +#endif + { + ITransformOne(reference, input, dst, scratch); + if (doTwo) + { + ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); + } } }