From 6e8def1cc87808965cd0fc5a6f141161cc02de27 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 00:11:44 +0100 Subject: [PATCH 01/69] Add sse2 version of inverse transform --- .../Formats/Webp/Lossy/LossyUtils.cs | 60 ++++-- .../Formats/Webp/Lossy/Vp8Encoding.cs | 204 +++++++++++++++++- 2 files changed, 239 insertions(+), 25 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 3064ccc030..cfac273c49 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -661,28 +661,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - Vector128 transpose00 = Sse2.UnpackLow(b0, b1); - Vector128 transpose01 = Sse2.UnpackLow(b2, b3); - Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); - Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); - - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); - Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); - - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - Vector128 output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); - Vector128 output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); - Vector128 output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); - Vector128 output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + 
Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -728,6 +707,43 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Unsafe.As>(ref outputRef) = result.AsInt32(); return sum[3] + sum[2] + sum[1] + sum[0]; } + + // Transpose two 4x4 16b matrices horizontally stored in registers. + public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) + { + // Transpose the two 4x4. + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + Vector128 transpose00 = Sse2.UnpackLow(b0, b1); + Vector128 transpose01 = Sse2.UnpackLow(b2, b3); + Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); + Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); + + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); + + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 b22 b32 b03 b13 b23 b33 + output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); + output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); + output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); + output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + + // a00 a10 a20 a30 b00 b10 b20 b30 
+ // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } #endif public static void TransformTwo(Span src, Span dst, Span scratch) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 0567a0f27d..cb149bec7f 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -4,6 +4,11 @@ using System; using System.Buffers.Binary; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Formats.Webp.Lossy { @@ -60,6 +65,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 }; +#if SUPPORTS_RUNTIME_INTRINSICS + public static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16(); + + public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16(); +#endif + static Vp8Encoding() { for (int i = -255; i <= 255 + 255; i++) @@ -68,12 +79,199 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } + // Transforms (Paragraph 14.4) + // Does one or two inverse transforms. 
public static void ITransform(Span reference, Span input, Span dst, bool doTwo, Span scratch) { - ITransformOne(reference, input, dst, scratch); - if (doTwo) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); + // This implementation makes use of 16-bit fixed point versions of two + // multiply constants: + // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 + // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 + // + // To be able to use signed 16-bit integers, we use the following trick to + // have constants within range: + // - Associated constants are obtained by subtracting the 16-bit fixed point + // version of one: + // k = K - (1 << 16) => K = k + (1 << 16) + // K1 = 85627 => k1 = 20091 + // K2 = 35468 => k2 = -30068 + // - The multiplication of a variable by a constant becomes the sum of the + // variable and the multiplication of that variable by the associated + // constant: + // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x + + // Load and concatenate the transform coefficients (we'll do two inverse + // transforms in parallel). In the case of only one inverse transform, the + // second half of the vectors will just contain random values we'll never + // use nor store. 
+ ref short inputRef = ref MemoryMarshal.GetReference(input); + var in0 = Vector128.Create(Unsafe.As(ref inputRef), 0); + var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 4)), 0); + var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 8)), 0); + var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 12)), 0); + + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + if (doTwo) + { + var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 16)), 0); + var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); + var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); + var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); + + in0 = Sse2.UnpackLow(in0, inb0); + in1 = Sse2.UnpackLow(in1, inb1); + in2 = Sse2.UnpackLow(in2, inb2); + in3 = Sse2.UnpackLow(in3, inb3); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Vertical pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. 
+ Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); + Vector128 c3 = Sse2.Subtract(in1, in3); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); + Vector128 d3 = Sse2.Add(in1, in3); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); + + // Second pass. + Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + + // Horizontal pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + var four = Vector128.Create((short)4); + Vector128 dc = Sse2.Add(t0.AsInt16(), four); + a = Sse2.Add(dc, t2.AsInt16()); + b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + c3 = Sse2.Subtract(t1, t3); + c4 = Sse2.Subtract(c1, c2); + c = Sse2.Add(c3.AsInt16(), c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + d3 = Sse2.Add(t1, t3); + d4 = Sse2.Add(d1, d2); + d = Sse2.Add(d3.AsInt16(), d4); + + // Second pass. 
+ tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + + // Add inverse transform to 'ref' and store. + // Load the reference(s). + Vector128 ref0 = Vector128.Zero; + Vector128 ref1 = Vector128.Zero; + Vector128 ref2 = Vector128.Zero; + Vector128 ref3 = Vector128.Zero; + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + if (doTwo) + { + // Load eight bytes/pixels per line. + ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); + ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); + ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); + ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); + } + else + { + // Load four bytes/pixels per line. + ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); + ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + } + + // Convert to 16b. + ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); + ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); + ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); + ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + + // Add the inverse transform(s). 
+ Vector128 ref0InvAdded = Sse2.Add(ref0.AsUInt16(), t0.AsUInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsUInt16(), t1.AsUInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsUInt16(), t2.AsUInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsUInt16(), t3.AsUInt16()); + + // Unsigned saturate to 8b. + ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded.AsInt16(), ref1InvAdded.AsInt16()); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded.AsInt16(), ref2InvAdded.AsInt16()); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded.AsInt16(), ref3InvAdded.AsInt16()); + + // Unsigned saturate to 8b. + if (doTwo) + { + // Store eight bytes/pixels per line. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + Unsafe.As>(ref outputRef) = ref0; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3; + } + else + { + // Store four bytes/pixels per line. 
+ int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); + int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); + int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); + int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); + + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + Unsafe.As(ref outputRef) = output0; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; + } + } + else +#endif + { + ITransformOne(reference, input, dst, scratch); + if (doTwo) + { + ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); + } } } From 835ecead49cd0e98b223d4e4cb9b32d11190b8b2 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 13:58:02 +0100 Subject: [PATCH 02/69] Store only eight bytes per line --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index c0f81b49f4..55fa2593c9 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// /// Methods for encoding a VP8 frame. 
/// - internal static class Vp8Encoding + internal static unsafe class Vp8Encoding { private const int KC1 = 20091 + (1 << 16); @@ -69,6 +69,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16(); public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16(); + + public static readonly Vector128 Four = Vector128.Create((short)4); #endif static Vp8Encoding() @@ -85,6 +87,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) + //if (false) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: @@ -165,8 +168,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - var four = Vector128.Create((short)4); - Vector128 dc = Sse2.Add(t0.AsInt16(), four); + Vector128 dc = Sse2.Add(t0.AsInt16(), Four); a = Sse2.Add(dc, t2.AsInt16()); b = Sse2.Subtract(dc, t2.AsInt16()); @@ -243,11 +245,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy if (doTwo) { // Store eight bytes/pixels per line. - ref byte outputRef = ref MemoryMarshal.GetReference(dst); - Unsafe.As>(ref outputRef) = ref0; - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1; - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2; - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3; + // TODO: avoid pinning, if possible. 
+ fixed (byte* dstPtr = dst) + { + Sse2.StoreScalar((long*)dstPtr, ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref0.AsInt64()); + } } else { From 5968de8f779c21d46facd7b088ec8ae05ecb4a7b Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 13:58:37 +0100 Subject: [PATCH 03/69] Add sse tests for inverse transform --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 1 - .../Formats/WebP/Vp8EncodingTests.cs | 57 +++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 55fa2593c9..8f8cf7643a 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -87,7 +87,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) - //if (false) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs new file mode 100644 index 0000000000..cd5a24d8cf --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -0,0 +1,57 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System.Linq; +using SixLabors.ImageSharp.Formats.Webp.Lossy; +using SixLabors.ImageSharp.Tests.TestUtilities; +using Xunit; + +namespace SixLabors.ImageSharp.Tests.Formats.WebP +{ + [Trait("Format", "Webp")] + public class Vp8EncodingTests + { + private static void RunInverseTransformTest() + { + // arrange + byte[] reference = + { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129 + }; + short[] input = { 177, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 177, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + byte[] dst = new byte[128]; + byte[] expected = + { + 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int[] scratch = new int[16]; + + // act + Vp8Encoding.ITransform(reference, input, dst, true, scratch); + + // assert + Assert.True(dst.SequenceEqual(expected)); + } + + [Fact] + public void InverseTransform_Works() => RunInverseTransformTest(); + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void InverseTransform_WithHardwareIntrinsics_Works() => 
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.AllowAll); + + [Fact] + public void InverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); +#endif + } +} From 5c0b598ece1dd8ca63664c93c01591310a98a16c Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 15:03:37 +0100 Subject: [PATCH 04/69] Fix copy paste mistake --- src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 8f8cf7643a..dab466b9a4 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -248,9 +248,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy fixed (byte* dstPtr = dst) { Sse2.StoreScalar((long*)dstPtr, ref0.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref0.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref0.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref1.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref2.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref3.AsInt64()); } } else From 6039d2a8719a263d9a71a4cffb8b1325d6384947 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 16:57:40 +0100 Subject: [PATCH 05/69] Better test case --- .../Formats/WebP/Vp8EncodingTests.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index cd5a24d8cf..0534963897 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ 
b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -24,15 +24,15 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129 }; - short[] input = { 177, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 177, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + short[] input = { 1, 216, -48, 0, 96, -24, -48, 24, 0, -24, 24, 0, 0, 0, 0, 0, 38, -240, -72, -24, 0, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; byte[] dst = new byte[128]; byte[] expected = { - 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 161, 160, 149, 105, 78, 127, 156, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 160, 160, 133, 85, 81, 129, 155, 167, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 156, 147, 109, 76, 85, 130, 153, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 152, 128, 87, 83, 88, 132, 152, 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int[] scratch = new int[16]; From abcbc4c48d6bce5543a45003742f98ccd0b7ef9d Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 17:01:56 +0100 Subject: [PATCH 06/69] Fix issue: vectors need to be short type --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index dab466b9a4..6ec191baaa 100644 --- 
a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -3,6 +3,7 @@ using System; using System.Buffers.Binary; +using System.Linq; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS @@ -145,14 +146,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); - Vector128 c3 = Sse2.Subtract(in1, in3); + Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); Vector128 c4 = Sse2.Subtract(c1, c2); Vector128 c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); - Vector128 d3 = Sse2.Add(in1, in3); + Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); Vector128 d4 = Sse2.Add(d1, d2); Vector128 d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); @@ -174,14 +175,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); - c3 = Sse2.Subtract(t1, t3); + c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); c4 = Sse2.Subtract(c1, c2); c = Sse2.Add(c3.AsInt16(), c4); // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); - d3 = Sse2.Add(t1, t3); + d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); d4 = Sse2.Add(d1, d2); d = Sse2.Add(d3.AsInt16(), d4); @@ -229,10 +230,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); // Add the inverse transform(s). 
- Vector128 ref0InvAdded = Sse2.Add(ref0.AsUInt16(), t0.AsUInt16()); - Vector128 ref1InvAdded = Sse2.Add(ref1.AsUInt16(), t1.AsUInt16()); - Vector128 ref2InvAdded = Sse2.Add(ref2.AsUInt16(), t2.AsUInt16()); - Vector128 ref3InvAdded = Sse2.Add(ref3.AsUInt16(), t3.AsUInt16()); + Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); // Unsigned saturate to 8b. ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); From 18ecb065a313601b5d81329f99677bc1357ce8d2 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 17:13:23 +0100 Subject: [PATCH 07/69] Add tests for executing only one transform --- .../Formats/WebP/Vp8EncodingTests.cs | 49 +++++++++++++++++-- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index 0534963897..c4f8601b14 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -11,7 +11,39 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP [Trait("Format", "Webp")] public class Vp8EncodingTests { - private static void RunInverseTransformTest() + private static void RunOneInverseTransformTest() + { + // arrange + byte[] reference = + { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 
129, 129, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129 + }; + short[] input = { 1, 216, -48, 0, 96, -24, -48, 24, 0, -24, 24, 0, 0, 0, 0, 0, 38, -240, -72, -24, 0, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + byte[] dst = new byte[128]; + byte[] expected = + { + 161, 160, 149, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 160, 160, 133, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 156, 147, 109, 76, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 152, 128, 87, 83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 + }; + int[] scratch = new int[16]; + + // act + Vp8Encoding.ITransform(reference, input, dst, false, scratch); + + // assert + Assert.True(dst.SequenceEqual(expected)); + } + + private static void RunTwoInverseTransformTest() { // arrange byte[] reference = @@ -44,14 +76,23 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP } [Fact] - public void InverseTransform_Works() => RunInverseTransformTest(); + public void OneInverseTransform_Works() => RunOneInverseTransformTest(); + + [Fact] + public void TwoInverseTransform_Works() => RunTwoInverseTransformTest(); #if SUPPORTS_RUNTIME_INTRINSICS [Fact] - public void InverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.AllowAll); + public void OneInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.AllowAll); + + [Fact] + public void OneInverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); + + [Fact] + public void TwoInverseTransform_WithHardwareIntrinsics_Works() => 
FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTwoInverseTransformTest, HwIntrinsics.AllowAll); [Fact] - public void InverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); + public void TwoInverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTwoInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); #endif } } From 6e548b5e5bace5fa4c58529d616ff14438fc89bf Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 17:25:00 +0100 Subject: [PATCH 08/69] Remove unnecessary casts --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 6ec191baaa..70500566f0 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -144,18 +144,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); + Vector128 c = Sse2.Add(c3, c4); // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); Vector128 d3 = 
Sse2.Add(in1.AsInt16(), in3.AsInt16()); Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); + Vector128 d = Sse2.Add(d3, d4); // Second pass. Vector128 tmp0 = Sse2.Add(a, d); @@ -177,14 +177,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3.AsInt16(), c4); + c = Sse2.Add(c3, c4); // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3.AsInt16(), d4); + d = Sse2.Add(d3, d4); // Second pass. tmp0 = Sse2.Add(a, d); @@ -236,10 +236,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); // Unsigned saturate to 8b. - ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); - ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded.AsInt16(), ref1InvAdded.AsInt16()); - ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded.AsInt16(), ref2InvAdded.AsInt16()); - ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded.AsInt16(), ref3InvAdded.AsInt16()); + ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); // Unsigned saturate to 8b. 
if (doTwo) From a201e8a1427976addf71974adfffc742cf8ab888 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 18:09:38 +0100 Subject: [PATCH 09/69] Avoid pinning --- src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 70500566f0..34a3a5f177 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -242,17 +242,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); // Unsigned saturate to 8b. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); if (doTwo) { // Store eight bytes/pixels per line. - // TODO: avoid pinning, if possible. - fixed (byte* dstPtr = dst) - { - Sse2.StoreScalar((long*)dstPtr, ref0.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref1.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref2.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref3.AsInt64()); - } + Unsafe.As>(ref outputRef) = ref0.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); } else { @@ -262,7 +259,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); - ref byte outputRef = ref MemoryMarshal.GetReference(dst); Unsafe.As(ref outputRef) = output0; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; From a2c0900f61fe854ce729a65814a33ec350a7bf5d Mon Sep 17 00:00:00 2001 From: Brian Popow 
Date: Fri, 12 Nov 2021 09:12:30 +0100 Subject: [PATCH 10/69] Avx version of CollectColorBlueTransforms --- .../Formats/Webp/Lossless/LosslessUtils.cs | 2 + .../Formats/Webp/Lossless/PredictorEncoder.cs | 60 ++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index f9b97c6c44..defa65b4be 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -744,6 +744,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return (float)retVal; } + [MethodImpl(InliningOptions.ShortMethod)] public static byte TransformColorRed(sbyte greenToRed, uint argb) { sbyte green = U32ToS8(argb >> 8); @@ -752,6 +753,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return (byte)(newRed & 0xff); } + [MethodImpl(InliningOptions.ShortMethod)] public static byte TransformColorBlue(sbyte greenToBlue, sbyte redToBlue, uint argb) { sbyte green = U32ToS8(argb >> 8); diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs index 99504dd488..3d4696d8dd 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs @@ -48,6 +48,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static readonly Vector128 CollectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); private static readonly Vector128 CollectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); + + private static readonly Vector256 CollectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255, 255); + + private static readonly Vector256 CollectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 18, 255, 22, 255, 26, 255, 30, 255); + + private static readonly Vector256 CollectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + + private static readonly Vector256 CollectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + + private static readonly Vector256 CollectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + #endif // This uses C#'s compiler optimization to refer to assembly's static data directly. 
@@ -1128,7 +1139,54 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse41.IsSupported) + if (Avx2.IsSupported && tileWidth > 16) + { + const int span = 16; + Span values = stackalloc ushort[span]; + var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); + var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); + for (int y = 0; y < tileHeight; y++) + { + Span srcSpan = bgra.Slice(y * stride); +#pragma warning disable SA1503 // Braces should not be omitted + fixed (uint* src = srcSpan) + fixed (ushort* dst = values) + { + for (int x = 0; x + span <= tileWidth; x += span) + { + uint* input0Idx = src + x; + uint* input1Idx = src + x + (span / 2); + Vector256 input0 = Avx.LoadVector256(input0Idx).AsByte(); + Vector256 input1 = Avx.LoadVector256(input1Idx).AsByte(); + Vector256 r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); + Vector256 r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256); + Vector256 r = Avx2.Or(r0, r1); + Vector256 gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256); + Vector256 gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256); + Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector256 g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256); + Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr); + Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); + Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte()); + Vector256 d = Avx2.Subtract(c, a.AsByte()); + Vector256 e = Avx2.And(d, CollectColorBlueTransformsBlueMask256); + Avx.Store(dst, e.AsUInt16()); + for (int i = 0; i < span; i++) + { + ++histo[values[i]]; + } + } + } +#pragma warning restore SA1503 // Braces should not be omitted + + int leftOver = tileWidth & (span - 1); + if 
(leftOver > 0) + { + CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + } + } + } + else if (Sse41.IsSupported) { const int span = 8; Span values = stackalloc ushort[span]; From c15e62ce5056cd9081bee9a3e9549ab1d3fe261b Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 12 Nov 2021 09:42:34 +0100 Subject: [PATCH 11/69] Avoid pinning --- .../Formats/Webp/Lossless/PredictorEncoder.cs | 144 +++++++++--------- 1 file changed, 69 insertions(+), 75 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs index 3d4696d8dd..d11102c404 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs @@ -1079,34 +1079,32 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless for (int y = 0; y < tileHeight; y++) { Span srcSpan = bgra.Slice(y * stride); -#pragma warning disable SA1503 // Braces should not be omitted - fixed (uint* src = srcSpan) - fixed (ushort* dst = values) + ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); + for (int x = 0; x + span <= tileWidth; x += span) { - for (int x = 0; x + span <= tileWidth; x += span) + int input0Idx = x; + int input1Idx = x + (span / 2); + Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); + Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); + Vector128 g0 = Sse2.And(input0, CollectColorRedTransformsGreenMask); // 0 0 | g 0 + Vector128 g1 = Sse2.And(input1, CollectColorRedTransformsGreenMask); + Vector128 g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 + Vector128 a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r + Vector128 a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16); + Vector128 a = Sse41.PackUnsignedSaturate(a0, a1); // x r + Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); 
// x dr + Vector128 c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r' + Vector128 d = Sse2.And(c, CollectColorRedTransformsAndMask); // 0 r' + + ref ushort outputRef = ref MemoryMarshal.GetReference(values); + Unsafe.As>(ref outputRef) = d.AsUInt16(); + + for (int i = 0; i < span; i++) { - uint* input0Idx = src + x; - uint* input1Idx = src + x + (span / 2); - Vector128 input0 = Sse2.LoadVector128((ushort*)input0Idx).AsByte(); - Vector128 input1 = Sse2.LoadVector128((ushort*)input1Idx).AsByte(); - Vector128 g0 = Sse2.And(input0, CollectColorRedTransformsGreenMask); // 0 0 | g 0 - Vector128 g1 = Sse2.And(input1, CollectColorRedTransformsGreenMask); - Vector128 g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 - Vector128 a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r - Vector128 a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16); - Vector128 a = Sse41.PackUnsignedSaturate(a0, a1); // x r - Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr - Vector128 c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r' - Vector128 d = Sse2.And(c, CollectColorRedTransformsAndMask); // 0 r' - Sse2.Store(dst, d.AsUInt16()); - for (int i = 0; i < span; i++) - { - ++histo[values[i]]; - } + ++histo[values[i]]; } } } -#pragma warning restore SA1503 // Braces should not be omitted int leftOver = tileWidth & (span - 1); if (leftOver > 0) @@ -1148,36 +1146,34 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless for (int y = 0; y < tileHeight; y++) { Span srcSpan = bgra.Slice(y * stride); -#pragma warning disable SA1503 // Braces should not be omitted - fixed (uint* src = srcSpan) - fixed (ushort* dst = values) + ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); + for (int x = 0; x + span <= tileWidth; x += span) { - for (int x = 0; x + span <= tileWidth; x += span) + int input0Idx = x; + int input1Idx = x + (span / 2); + Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); + Vector256 input1 = Unsafe.As>(ref 
Unsafe.Add(ref inputRef, input1Idx)).AsByte(); + Vector256 r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); + Vector256 r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256); + Vector256 r = Avx2.Or(r0, r1); + Vector256 gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256); + Vector256 gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256); + Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector256 g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256); + Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr); + Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); + Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte()); + Vector256 d = Avx2.Subtract(c, a.AsByte()); + Vector256 e = Avx2.And(d, CollectColorBlueTransformsBlueMask256); + + ref ushort outputRef = ref MemoryMarshal.GetReference(values); + Unsafe.As>(ref outputRef) = e.AsUInt16(); + + for (int i = 0; i < span; i++) { - uint* input0Idx = src + x; - uint* input1Idx = src + x + (span / 2); - Vector256 input0 = Avx.LoadVector256(input0Idx).AsByte(); - Vector256 input1 = Avx.LoadVector256(input1Idx).AsByte(); - Vector256 r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); - Vector256 r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256); - Vector256 r = Avx2.Or(r0, r1); - Vector256 gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256); - Vector256 gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256); - Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); - Vector256 g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256); - Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr); - Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); - Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte()); - Vector256 d = Avx2.Subtract(c, a.AsByte()); - Vector256 e = Avx2.And(d, CollectColorBlueTransformsBlueMask256); - Avx.Store(dst, 
e.AsUInt16()); - for (int i = 0; i < span; i++) - { - ++histo[values[i]]; - } + ++histo[values[i]]; } } -#pragma warning restore SA1503 // Braces should not be omitted int leftOver = tileWidth & (span - 1); if (leftOver > 0) @@ -1195,37 +1191,35 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless for (int y = 0; y < tileHeight; y++) { Span srcSpan = bgra.Slice(y * stride); -#pragma warning disable SA1503 // Braces should not be omitted - fixed (uint* src = srcSpan) - fixed (ushort* dst = values) + ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); + for (int x = 0; x + span <= tileWidth; x += span) { - for (int x = 0; x + span <= tileWidth; x += span) + int input0Idx = x; + int input1Idx = x + (span / 2); + Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); + Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); + Vector128 r0 = Ssse3.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask); + Vector128 r1 = Ssse3.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask); + Vector128 r = Sse2.Or(r0, r1); + Vector128 gb0 = Sse2.And(input0, CollectColorBlueTransformsGreenBlueMask); + Vector128 gb1 = Sse2.And(input1, CollectColorBlueTransformsGreenBlueMask); + Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector128 g = Sse2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask); + Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr); + Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); + Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte()); + Vector128 d = Sse2.Subtract(c, a.AsByte()); + Vector128 e = Sse2.And(d, CollectColorBlueTransformsBlueMask); + + ref ushort outputRef = ref MemoryMarshal.GetReference(values); + Unsafe.As>(ref outputRef) = e.AsUInt16(); + + for (int i = 0; i < span; i++) { - uint* input0Idx = src + x; - uint* input1Idx = src + x + (span / 2); - Vector128 input0 = Sse2.LoadVector128((ushort*)input0Idx).AsByte(); - Vector128 input1 = 
Sse2.LoadVector128((ushort*)input1Idx).AsByte(); - Vector128 r0 = Ssse3.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask); - Vector128 r1 = Ssse3.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask); - Vector128 r = Sse2.Or(r0, r1); - Vector128 gb0 = Sse2.And(input0, CollectColorBlueTransformsGreenBlueMask); - Vector128 gb1 = Sse2.And(input1, CollectColorBlueTransformsGreenBlueMask); - Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); - Vector128 g = Sse2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask); - Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr); - Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); - Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte()); - Vector128 d = Sse2.Subtract(c, a.AsByte()); - Vector128 e = Sse2.And(d, CollectColorBlueTransformsBlueMask); - Sse2.Store(dst, e.AsUInt16()); - for (int i = 0; i < span; i++) - { - ++histo[values[i]]; - } + ++histo[values[i]]; } } } -#pragma warning restore SA1503 // Braces should not be omitted int leftOver = tileWidth & (span - 1); if (leftOver > 0) From 8806d6bd24fe58910b68b22d925a3e92c5385e5a Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 12 Nov 2021 10:14:34 +0100 Subject: [PATCH 12/69] Add Avx version of CollectColorRedTransforms --- .../Formats/Webp/Lossless/PredictorEncoder.cs | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs index d11102c404..48c02f0d36 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs @@ -39,6 +39,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static readonly Vector128 CollectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte(); + private static readonly Vector256 CollectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte(); + + private static 
readonly Vector256 CollectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte(); + private static readonly Vector128 CollectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); private static readonly Vector128 CollectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); @@ -1071,7 +1075,48 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static void CollectColorRedTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse41.IsSupported) + if (Avx2.IsSupported && tileWidth > 16) + { + var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed)); + const int span = 16; + Span values = stackalloc ushort[span]; + for (int y = 0; y < tileHeight; y++) + { + Span srcSpan = bgra.Slice(y * stride); + ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); + for (int x = 0; x + span <= tileWidth; x += span) + { + int input0Idx = x; + int input1Idx = x + (span / 2); + Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); + Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); + Vector256 g0 = Avx2.And(input0, CollectColorRedTransformsGreenMask256); // 0 0 | g 0 + Vector256 g1 = Avx2.And(input1, CollectColorRedTransformsGreenMask256); + Vector256 g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 + Vector256 a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r + Vector256 a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16); + Vector256 a = Avx2.PackUnsignedSaturate(a0, a1); // x r + Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr + Vector256 c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r' + Vector256 d = Avx2.And(c, CollectColorRedTransformsAndMask256); // 0 r' + + ref ushort outputRef = ref MemoryMarshal.GetReference(values); + Unsafe.As>(ref 
outputRef) = d.AsUInt16(); + + for (int i = 0; i < span; i++) + { + ++histo[values[i]]; + } + } + } + + int leftOver = tileWidth & (span - 1); + if (leftOver > 0) + { + CollectColorRedTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToRed, histo); + } + } + else if (Sse41.IsSupported) { var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed)); const int span = 8; From b7059ae23a72f62ae760f453b21d64fbd63057b0 Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Fri, 12 Nov 2021 12:58:58 +0100 Subject: [PATCH 13/69] Add [MethodImpl(InliningOptions.ShortMethod)] Co-authored-by: Anton Firszov --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index c80fd5817a..b8986f66ff 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -750,6 +750,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } // Transpose two 4x4 16b matrices horizontally stored in registers. + [MethodImpl(InliningOptions.ShortMethod)] public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) { // Transpose the two 4x4. 
From 544319e9ea8689e6f257c03e7990136bbfaad53e Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 12 Nov 2021 13:18:41 +0100 Subject: [PATCH 14/69] ITransform now always does two transforms --- src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 6 +- .../Formats/Webp/Lossy/Vp8Encoding.cs | 277 ++++++++++++------ .../Formats/WebP/Vp8EncodingTests.cs | 4 +- 3 files changed, 192 insertions(+), 95 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs index 38ed80590d..2fcea8ceea 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs @@ -329,7 +329,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy LossyUtils.TransformWht(dcTmp, tmp, scratch); for (n = 0; n < 16; n += 2) { - Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch); + Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), scratch); } return nz; @@ -342,7 +342,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Span scratch = it.Scratch3.AsSpan(0, 16); Vp8Encoding.FTransform(src, reference, tmp, scratch); int nz = QuantizeBlock(tmp, levels, ref dqm.Y1); - Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch); + Vp8Encoding.ITransformOne(reference, tmp, yuvOut, scratch); return nz; } @@ -375,7 +375,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy for (n = 0; n < 8; n += 2) { - Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch); + Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), scratch); } return nz << 16; diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs 
b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 34a3a5f177..bcecdcd757 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -3,7 +3,6 @@ using System; using System.Buffers.Binary; -using System.Linq; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS @@ -16,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// /// Methods for encoding a VP8 frame. /// - internal static unsafe class Vp8Encoding + internal static class Vp8Encoding { private const int KC1 = 20091 + (1 << 16); @@ -83,8 +82,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } // Transforms (Paragraph 14.4) - // Does one or two inverse transforms. - public static void ITransform(Span reference, Span input, Span dst, bool doTwo, Span scratch) + // Does two inverse transforms. + public static void ITransform(Span reference, Span input, Span dst, Span scratch) { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) @@ -120,23 +119,20 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // a01 a11 a21 a31 x x x x // a02 a12 a22 a32 x x x x // a03 a13 a23 a33 x x x x - if (doTwo) - { - var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 16)), 0); - var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); - var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); - var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); - - in0 = Sse2.UnpackLow(in0, inb0); - in1 = Sse2.UnpackLow(in1, inb1); - in2 = Sse2.UnpackLow(in2, inb2); - in3 = Sse2.UnpackLow(in3, inb3); - - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 - } + var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 16)), 0); + var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); + var inb2 = 
Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); + var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); + + in0 = Sse2.UnpackLow(in0, inb0); + in1 = Sse2.UnpackLow(in1, inb1); + in2 = Sse2.UnpackLow(in2, inb2); + in3 = Sse2.UnpackLow(in3, inb3); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. @@ -206,22 +202,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 ref2 = Vector128.Zero; Vector128 ref3 = Vector128.Zero; ref byte referenceRef = ref MemoryMarshal.GetReference(reference); - if (doTwo) - { - // Load eight bytes/pixels per line. - ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); - ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); - ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); - ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); - } - else - { - // Load four bytes/pixels per line. - ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); - ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); - ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); - ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); - } + + // Load eight bytes/pixels per line. 
+ ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); + ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); + ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); + ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); // Convert to 16b. ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); @@ -243,72 +229,183 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Unsigned saturate to 8b. ref byte outputRef = ref MemoryMarshal.GetReference(dst); - if (doTwo) - { - // Store eight bytes/pixels per line. - Unsafe.As>(ref outputRef) = ref0.GetLower(); - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); - } - else - { - // Store four bytes/pixels per line. - int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); - int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); - int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); - int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); - - Unsafe.As(ref outputRef) = output0; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; - } + + // Store eight bytes/pixels per line. 
+ Unsafe.As>(ref outputRef) = ref0.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); } else #endif { ITransformOne(reference, input, dst, scratch); - if (doTwo) - { - ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); - } + ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); } } public static void ITransformOne(Span reference, Span input, Span dst, Span scratch) { - int i; - Span tmp = scratch.Slice(0, 16); - for (i = 0; i < 4; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - // vertical pass. - int a = input[0] + input[8]; - int b = input[0] - input[8]; - int c = Mul(input[4], KC2) - Mul(input[12], KC1); - int d = Mul(input[4], KC1) + Mul(input[12], KC2); - tmp[0] = a + d; - tmp[1] = b + c; - tmp[2] = b - c; - tmp[3] = a - d; - tmp = tmp.Slice(4); - input = input.Slice(1); - } + // Load and concatenate the transform coefficients (we'll do two inverse + // transforms in parallel). In the case of only one inverse transform, the + // second half of the vectors will just contain random value we'll never + // use nor store. + ref short inputRef = ref MemoryMarshal.GetReference(input); + var in0 = Vector128.Create(Unsafe.As(ref inputRef), 0); + var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 4)), 0); + var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 8)), 0); + var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 12)), 0); - tmp = scratch; - for (i = 0; i < 4; i++) + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + + // Vertical pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. 
+ Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. + Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + + // Horizontal pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + Vector128 dc = Sse2.Add(t0.AsInt16(), Four); + a = Sse2.Add(dc, t2.AsInt16()); + b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); + c4 = Sse2.Subtract(c1, c2); + c = Sse2.Add(c3, c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); + d4 = Sse2.Add(d1, d2); + d = Sse2.Add(d3, d4); + + // Second pass. 
+ tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + + // Add inverse transform to 'ref' and store. + // Load the reference(s). + Vector128 ref0 = Vector128.Zero; + Vector128 ref1 = Vector128.Zero; + Vector128 ref2 = Vector128.Zero; + Vector128 ref3 = Vector128.Zero; + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + + // Load four bytes/pixels per line. + ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); + ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + + // Convert to 16b. + ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); + ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); + ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); + ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + + // Add the inverse transform(s). + Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); + + // Unsigned saturate to 8b. 
+ ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); + + // Unsigned saturate to 8b. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + + // Store four bytes/pixels per line. + int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); + int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); + int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); + int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); + + Unsafe.As(ref outputRef) = output0; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; + } + else +#endif { - // horizontal pass. - int dc = tmp[0] + 4; - int a = dc + tmp[8]; - int b = dc - tmp[8]; - int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1); - int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2); - Store(dst, reference, 0, i, a + d); - Store(dst, reference, 1, i, b + c); - Store(dst, reference, 2, i, b - c); - Store(dst, reference, 3, i, a - d); - tmp = tmp.Slice(1); + int i; + Span tmp = scratch.Slice(0, 16); + for (i = 0; i < 4; i++) + { + // vertical pass. + int a = input[0] + input[8]; + int b = input[0] - input[8]; + int c = Mul(input[4], KC2) - Mul(input[12], KC1); + int d = Mul(input[4], KC1) + Mul(input[12], KC2); + tmp[0] = a + d; + tmp[1] = b + c; + tmp[2] = b - c; + tmp[3] = a - d; + tmp = tmp.Slice(4); + input = input.Slice(1); + } + + tmp = scratch; + for (i = 0; i < 4; i++) + { + // horizontal pass. 
+ int dc = tmp[0] + 4; + int a = dc + tmp[8]; + int b = dc - tmp[8]; + int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1); + int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2); + Store(dst, reference, 0, i, a + d); + Store(dst, reference, 1, i, b + c); + Store(dst, reference, 2, i, b - c); + Store(dst, reference, 3, i, a - d); + tmp = tmp.Slice(1); + } } } diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index c4f8601b14..17c9beb9b7 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP int[] scratch = new int[16]; // act - Vp8Encoding.ITransform(reference, input, dst, false, scratch); + Vp8Encoding.ITransformOne(reference, input, dst, scratch); // assert Assert.True(dst.SequenceEqual(expected)); @@ -69,7 +69,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP int[] scratch = new int[16]; // act - Vp8Encoding.ITransform(reference, input, dst, true, scratch); + Vp8Encoding.ITransform(reference, input, dst, scratch); // assert Assert.True(dst.SequenceEqual(expected)); From 5074ee6204f7c33875ee40988f1dc9bb20211a3b Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 12 Nov 2021 13:33:30 +0100 Subject: [PATCH 15/69] Refactor: extract horizontal and vertical pass into methods --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 161 +++++++----------- 1 file changed, 63 insertions(+), 98 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index bcecdcd757..aa4ab5767b 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -136,61 +136,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. 
- Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); - - // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3, c4); - - // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); - - // Second pass. - Vector128 tmp0 = Sse2.Add(a, d); - Vector128 tmp1 = Sse2.Add(b, c); - Vector128 tmp2 = Sse2.Subtract(b, c); - Vector128 tmp3 = Sse2.Subtract(a, d); + InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. 
- Vector128 dc = Sse2.Add(t0.AsInt16(), Four); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); - - // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); - - // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); - - // Second pass. - tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); @@ -266,61 +219,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. 
- Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); - - // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3, c4); - - // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); - - // Second pass. - Vector128 tmp0 = Sse2.Add(a, d); - Vector128 tmp1 = Sse2.Add(b, c); - Vector128 tmp2 = Sse2.Subtract(b, c); - Vector128 tmp3 = Sse2.Subtract(a, d); + InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. 
- Vector128 dc = Sse2.Add(t0.AsInt16(), Four); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); - - // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); - - // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); - - // Second pass. - tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. 
LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); @@ -409,6 +315,65 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } +#if SUPPORTS_RUNTIME_INTRINSICS + private static void InverseTransformVerticalPass(Vector128 in0, Vector128 in2, Vector128 in1, Vector128 in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3) + { + Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. 
+ tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + } + + private static void InverseTransformHorizontalPass(Vector128 t0, Vector128 t2, Vector128 t1, Vector128 t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3) + { + Vector128 dc = Sse2.Add(t0.AsInt16(), Four); + Vector128 a = Sse2.Add(dc, t2.AsInt16()); + Vector128 b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + Vector128 c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + Vector128 d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. 
+ Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + } +#endif + public static void FTransform2(Span src, Span reference, Span output, Span output2, Span scratch) { FTransform(src, reference, output, scratch); From 0e3eda9840bae44b07eeb3b22f4e8696f01c3f98 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 12 Nov 2021 15:33:02 +0100 Subject: [PATCH 16/69] Add tests with and without avx --- .../Formats/Webp/Lossless/PredictorEncoder.cs | 34 +++++++++---------- .../Formats/WebP/PredictorEncoderTests.cs | 6 ++++ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs index 48c02f0d36..e40045c46c 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs @@ -587,19 +587,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return (byte)lower; } - else - { - // upper is closer to residual than lower. - if (residual <= boundaryResidual && upper > boundaryResidual) - { - // Halve quantization step to avoid crossing boundary. This midpoint is - // on the same side of boundary as residual because midpoint <= residual - // (since upper is closer than lower) and residual is below the boundary. - return (byte)(lower + (quantization >> 1)); - } - return (byte)(upper & 0xff); + // upper is closer to residual than lower. + if (residual <= boundaryResidual && upper > boundaryResidual) + { + // Halve quantization step to avoid crossing boundary. 
This midpoint is + // on the same side of boundary as residual because midpoint <= residual + // (since upper is closer than lower) and residual is below the boundary. + return (byte)(lower + (quantization >> 1)); } + + return (byte)(upper & 0xff); } /// @@ -1075,7 +1073,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static void CollectColorRedTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported && tileWidth > 16) + if (Avx2.IsSupported && tileWidth >= 16) { var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed)); const int span = 16; @@ -1182,7 +1180,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported && tileWidth > 16) + if (Avx2.IsSupported && tileWidth >= 16) { const int span = 16; Span values = stackalloc ushort[span]; @@ -1219,12 +1217,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless ++histo[values[i]]; } } + } - int leftOver = tileWidth & (span - 1); - if (leftOver > 0) - { - CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); - } + int leftOver = tileWidth & (span - 1); + if (leftOver > 0) + { + CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); } } else if (Sse41.IsSupported) diff --git a/tests/ImageSharp.Tests/Formats/WebP/PredictorEncoderTests.cs b/tests/ImageSharp.Tests/Formats/WebP/PredictorEncoderTests.cs index d78f7e2f2a..98c144a90d 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/PredictorEncoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/PredictorEncoderTests.cs @@ -40,8 +40,13 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Fact] 
public void ColorSpaceTransform_WithBikeImage_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(ColorSpaceTransform_WithBikeImage_ProducesExpectedData, HwIntrinsics.DisableSSE41); + + [Fact] + public void ColorSpaceTransform_WithBikeImage_WithoutAvx2_Works() + => FeatureTestRunner.RunWithHwIntrinsicsFeature(ColorSpaceTransform_WithBikeImage_ProducesExpectedData, HwIntrinsics.DisableAVX2); #endif + // Test image: Input\Webp\peak.png private static void RunColorSpaceTransformTestWithPeakImage() { // arrange @@ -99,6 +104,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp Assert.Equal(expectedData, transformData); } + // Test image: Input\Png\Bike.png private static void RunColorSpaceTransformTestWithBikeImage() { // arrange From 03c2c229bc5805e91da649bef6670f6bb2fe8a68 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 12 Nov 2021 17:02:11 +0100 Subject: [PATCH 17/69] Fix shuffle high mask --- src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs index e40045c46c..cbde586b79 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs @@ -55,7 +55,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static readonly Vector256 CollectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255); - private static readonly Vector256 CollectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 18, 255, 22, 255, 26, 255, 30, 255); + private static readonly Vector256 CollectColorBlueTransformsShuffleHighMask256 = 
Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 18, 255, 22, 255, 26, 255, 30); private static readonly Vector256 CollectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); From bdd728e4d331aa89124d113051e4483d040d6ca2 Mon Sep 17 00:00:00 2001 From: Berkan Diler Date: Sun, 14 Nov 2021 10:32:59 +0100 Subject: [PATCH 18/69] Revert a de-optimization from #1734 and add a comment --- src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs index d9d42e0614..abe59516fa 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs @@ -288,8 +288,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg /// The number of components to write. private void WriteDefineHuffmanTables(int componentCount) { + // This uses a C#'s compiler optimization that refers to the static data segment of the assembly, + // and doesn't incur any allocation at all. // Table identifiers. 
- ReadOnlySpan headers = stackalloc byte[] + ReadOnlySpan headers = new byte[] { 0x00, 0x10, From c22919d55eb02ee1a553e0dd3d8bb7b67b701d21 Mon Sep 17 00:00:00 2001 From: Berkan Diler Date: Sun, 14 Nov 2021 10:58:46 +0100 Subject: [PATCH 19/69] Replace Span.Fill(default) calls with Span.Clear() Span.Clear() is more optimized than Span.Fill(default) --- .../Tiff/Compression/Compressors/TiffLzwEncoder.cs | 4 ++-- .../Compression/Decompressors/T6TiffCompression.cs | 12 ++++++++++-- src/ImageSharp/Formats/Webp/Lossless/CostModel.cs | 2 +- .../Formats/Webp/Lossless/HistogramEncoder.cs | 2 +- src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs | 2 +- .../Formats/Webp/Lossless/Vp8LHistogram.cs | 10 +++++----- src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs | 8 ++++---- src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs | 2 +- 8 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/ImageSharp/Formats/Tiff/Compression/Compressors/TiffLzwEncoder.cs b/src/ImageSharp/Formats/Tiff/Compression/Compressors/TiffLzwEncoder.cs index baeabdbb20..d4d1d1cb65 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/Compressors/TiffLzwEncoder.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/Compressors/TiffLzwEncoder.cs @@ -256,8 +256,8 @@ namespace SixLabors.ImageSharp.Formats.Tiff.Compression.Compressors private void ResetTables() { - this.children.GetSpan().Fill(0); - this.siblings.GetSpan().Fill(0); + this.children.GetSpan().Clear(); + this.siblings.GetSpan().Clear(); this.bitsPerCode = MinBits; this.maxCode = MaxValue(this.bitsPerCode); this.nextValidCode = EoiCode + 1; diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/T6TiffCompression.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/T6TiffCompression.cs index e86418741d..972f4d8ff1 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/T6TiffCompression.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/T6TiffCompression.cs @@ -64,7 +64,7 @@ namespace 
SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors uint bitsWritten = 0; for (int y = 0; y < height; y++) { - scanLine.Fill(0); + scanLine.Clear(); Decode2DScanline(bitReader, this.isWhiteZero, referenceScanLine, scanLine); bitsWritten = this.WriteScanLine(buffer, scanLine, bitsWritten); @@ -116,7 +116,15 @@ namespace SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors { // If a TIFF reader encounters EOFB before the expected number of lines has been extracted, // it is appropriate to assume that the missing rows consist entirely of white pixels. - scanline.Fill(whiteIsZero ? (byte)0 : (byte)255); + if (whiteIsZero) + { + scanline.Clear(); + } + else + { + scanline.Fill((byte)255); + } + break; } diff --git a/src/ImageSharp/Formats/Webp/Lossless/CostModel.cs b/src/ImageSharp/Formats/Webp/Lossless/CostModel.cs index 7f4d0307bc..bdaf30dc9c 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/CostModel.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/CostModel.cs @@ -87,7 +87,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (nonzeros <= 1) { - output.AsSpan(0, numSymbols).Fill(0); + output.AsSpan(0, numSymbols).Clear(); } else { diff --git a/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs index 5d407d73c1..b52f8eb5d5 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs @@ -287,7 +287,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless // Create a mapping from a cluster id to its minimal version. int clusterMax = 0; - clusterMappingsTmp.AsSpan().Fill(0); + clusterMappingsTmp.AsSpan().Clear(); // Re-map the ids. 
for (int i = 0; i < symbols.Length; i++) diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs index 3c81f1a22c..5db01ca1c7 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs @@ -28,7 +28,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless public static void CreateHuffmanTree(uint[] histogram, int treeDepthLimit, bool[] bufRle, HuffmanTree[] huffTree, HuffmanTreeCode huffCode) { int numSymbols = huffCode.NumSymbols; - bufRle.AsSpan().Fill(false); + bufRle.AsSpan().Clear(); OptimizeHuffmanForRle(numSymbols, bufRle, histogram); GenerateOptimalTree(huffTree, histogram, numSymbols, treeDepthLimit, huffCode.CodeLengths); diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs index 8b02015687..bdb53f5c6a 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs @@ -320,7 +320,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } else { - output.Literal.AsSpan(0, literalSize).Fill(0); + output.Literal.AsSpan(0, literalSize).Clear(); } } @@ -343,7 +343,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } else { - output.Red.AsSpan(0, size).Fill(0); + output.Red.AsSpan(0, size).Clear(); } } @@ -366,7 +366,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } else { - output.Blue.AsSpan(0, size).Fill(0); + output.Blue.AsSpan(0, size).Clear(); } } @@ -389,7 +389,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } else { - output.Alpha.AsSpan(0, size).Fill(0); + output.Alpha.AsSpan(0, size).Clear(); } } @@ -412,7 +412,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } else { - output.Distance.AsSpan(0, size).Fill(0); + output.Distance.AsSpan(0, size).Clear(); } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs 
b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs index 6279aef656..fcd61f2c0e 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs @@ -911,7 +911,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy this.LeftNz[8] = 0; - this.LeftDerr.AsSpan().Fill(0); + this.LeftDerr.AsSpan().Clear(); } private void InitTop() @@ -919,14 +919,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int topSize = this.mbw * 16; this.YTop.AsSpan(0, topSize).Fill(127); this.UvTop.AsSpan().Fill(127); - this.Nz.AsSpan().Fill(0); + this.Nz.AsSpan().Clear(); int predsW = (4 * this.mbw) + 1; int predsH = (4 * this.mbh) + 1; int predsSize = predsW * predsH; - this.Preds.AsSpan(predsSize + this.predsWidth, this.mbw).Fill(0); + this.Preds.AsSpan(predsSize + this.predsWidth, this.mbw).Clear(); - this.TopDerr.AsSpan().Fill(0); + this.TopDerr.AsSpan().Clear(); } private int Bit(uint nz, int n) => (nz & (1 << n)) != 0 ? 1 : 0; diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs index 8a4115d216..37e09d0802 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs @@ -546,7 +546,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int predsW = (4 * this.Mbw) + 1; int predsH = (4 * this.Mbh) + 1; int predsSize = predsW * predsH; - this.Preds.AsSpan(predsSize + this.PredsWidth - 4, 4).Fill(0); + this.Preds.AsSpan(predsSize + this.PredsWidth - 4, 4).Clear(); this.Nz[0] = 0; // constant } From 55f04f6323181dd2e96fde512c60c628563b4834 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 15 Nov 2021 11:40:22 +1100 Subject: [PATCH 20/69] Update PredictorEncoder.cs --- src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs 
b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs index cbde586b79..95c9065b35 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs @@ -53,9 +53,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static readonly Vector128 CollectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); - private static readonly Vector256 CollectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255); + private static readonly Vector256 CollectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255); - private static readonly Vector256 CollectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 18, 255, 22, 255, 26, 255, 30); + private static readonly Vector256 CollectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30); private static readonly Vector256 CollectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); From 345e7c640d36dd2e7bfc85a0551664f51353fb67 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 11:34:12 +0100 Subject: [PATCH 21/69] Move color space transform methods into own class --- .../Webp/Lossless/ColorSpaceTransformUtils.cs | 268 ++++++++++++++++++ 
.../Formats/Webp/Lossless/PredictorEncoder.cs | 262 +---------------- 2 files changed, 270 insertions(+), 260 deletions(-) create mode 100644 src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs new file mode 100644 index 0000000000..4a8488f1b0 --- /dev/null +++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs @@ -0,0 +1,268 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + +namespace SixLabors.ImageSharp.Formats.Webp.Lossless +{ + internal static class ColorSpaceTransformUtils + { +#if SUPPORTS_RUNTIME_INTRINSICS + private static readonly Vector128 CollectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte(); + + private static readonly Vector128 CollectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte(); + + private static readonly Vector256 CollectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte(); + + private static readonly Vector256 CollectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte(); + + private static readonly Vector128 CollectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + private static readonly Vector128 CollectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + + private static readonly Vector128 CollectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + + private static readonly Vector128 CollectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 
14, 255, 255, 255, 255, 255, 255, 255, 255); + + private static readonly Vector128 CollectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); + + private static readonly Vector256 CollectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255); + + private static readonly Vector256 CollectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30); + + private static readonly Vector256 CollectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + + private static readonly Vector256 CollectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + + private static readonly Vector256 CollectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +#endif + + public static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported && tileWidth >= 16) + { + const int span = 16; + Span values = stackalloc ushort[span]; + var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); + var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); + for (int y = 0; y < tileHeight; y++) + { + Span srcSpan = bgra.Slice(y * stride); + ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); + for (int x = 
0; x + span <= tileWidth; x += span) + { + int input0Idx = x; + int input1Idx = x + (span / 2); + Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); + Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); + Vector256 r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); + Vector256 r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256); + Vector256 r = Avx2.Or(r0, r1); + Vector256 gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256); + Vector256 gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256); + Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector256 g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256); + Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr); + Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); + Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte()); + Vector256 d = Avx2.Subtract(c, a.AsByte()); + Vector256 e = Avx2.And(d, CollectColorBlueTransformsBlueMask256); + + ref ushort outputRef = ref MemoryMarshal.GetReference(values); + Unsafe.As>(ref outputRef) = e.AsUInt16(); + + for (int i = 0; i < span; i++) + { + ++histo[values[i]]; + } + } + } + + int leftOver = tileWidth & (span - 1); + if (leftOver > 0) + { + CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + } + } + else if (Sse41.IsSupported) + { + const int span = 8; + Span values = stackalloc ushort[span]; + var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue)); + var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue)); + for (int y = 0; y < tileHeight; y++) + { + Span srcSpan = bgra.Slice(y * stride); + ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); + for (int x = 0; x + span <= tileWidth; x += span) + { + int input0Idx = x; + int input1Idx = x + (span / 2); + Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref 
inputRef, input0Idx)).AsByte(); + Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); + Vector128 r0 = Ssse3.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask); + Vector128 r1 = Ssse3.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask); + Vector128 r = Sse2.Or(r0, r1); + Vector128 gb0 = Sse2.And(input0, CollectColorBlueTransformsGreenBlueMask); + Vector128 gb1 = Sse2.And(input1, CollectColorBlueTransformsGreenBlueMask); + Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); + Vector128 g = Sse2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask); + Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr); + Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); + Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte()); + Vector128 d = Sse2.Subtract(c, a.AsByte()); + Vector128 e = Sse2.And(d, CollectColorBlueTransformsBlueMask); + + ref ushort outputRef = ref MemoryMarshal.GetReference(values); + Unsafe.As>(ref outputRef) = e.AsUInt16(); + + for (int i = 0; i < span; i++) + { + ++histo[values[i]]; + } + } + } + + int leftOver = tileWidth & (span - 1); + if (leftOver > 0) + { + CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); + } + } + else +#endif + { + CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); + } + } + + private static void CollectColorBlueTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) + { + int pos = 0; + while (tileHeight-- > 0) + { + for (int x = 0; x < tileWidth; x++) + { + int idx = LosslessUtils.TransformColorBlue((sbyte)greenToBlue, (sbyte)redToBlue, bgra[pos + x]); + ++histo[idx]; + } + + pos += stride; + } + } + + public static void CollectColorRedTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) + { +#if 
SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported && tileWidth >= 16) + { + var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed)); + const int span = 16; + Span values = stackalloc ushort[span]; + for (int y = 0; y < tileHeight; y++) + { + Span srcSpan = bgra.Slice(y * stride); + ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); + for (int x = 0; x + span <= tileWidth; x += span) + { + int input0Idx = x; + int input1Idx = x + (span / 2); + Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); + Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); + Vector256 g0 = Avx2.And(input0, CollectColorRedTransformsGreenMask256); // 0 0 | g 0 + Vector256 g1 = Avx2.And(input1, CollectColorRedTransformsGreenMask256); + Vector256 g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 + Vector256 a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r + Vector256 a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16); + Vector256 a = Avx2.PackUnsignedSaturate(a0, a1); // x r + Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr + Vector256 c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r' + Vector256 d = Avx2.And(c, CollectColorRedTransformsAndMask256); // 0 r' + + ref ushort outputRef = ref MemoryMarshal.GetReference(values); + Unsafe.As>(ref outputRef) = d.AsUInt16(); + + for (int i = 0; i < span; i++) + { + ++histo[values[i]]; + } + } + } + + int leftOver = tileWidth & (span - 1); + if (leftOver > 0) + { + CollectColorRedTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToRed, histo); + } + } + else if (Sse41.IsSupported) + { + var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed)); + const int span = 8; + Span values = stackalloc ushort[span]; + for (int y = 0; y < tileHeight; y++) + { + Span srcSpan = bgra.Slice(y * stride); + ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); + for (int x = 0; x + span <= 
tileWidth; x += span) + { + int input0Idx = x; + int input1Idx = x + (span / 2); + Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); + Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); + Vector128 g0 = Sse2.And(input0, CollectColorRedTransformsGreenMask); // 0 0 | g 0 + Vector128 g1 = Sse2.And(input1, CollectColorRedTransformsGreenMask); + Vector128 g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 + Vector128 a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r + Vector128 a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16); + Vector128 a = Sse41.PackUnsignedSaturate(a0, a1); // x r + Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr + Vector128 c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r' + Vector128 d = Sse2.And(c, CollectColorRedTransformsAndMask); // 0 r' + + ref ushort outputRef = ref MemoryMarshal.GetReference(values); + Unsafe.As>(ref outputRef) = d.AsUInt16(); + + for (int i = 0; i < span; i++) + { + ++histo[values[i]]; + } + } + } + + int leftOver = tileWidth & (span - 1); + if (leftOver > 0) + { + CollectColorRedTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToRed, histo); + } + } + else +#endif + { + CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo); + } + } + + private static void CollectColorRedTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) + { + int pos = 0; + while (tileHeight-- > 0) + { + for (int x = 0; x < tileWidth; x++) + { + int idx = LosslessUtils.TransformColorRed((sbyte)greenToRed, bgra[pos + x]); + ++histo[idx]; + } + + pos += stride; + } + } + } +} diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs index 95c9065b35..1f7b284e9d 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs +++ 
b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs @@ -5,11 +5,6 @@ using System; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -#if SUPPORTS_RUNTIME_INTRINSICS -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; -#endif - namespace SixLabors.ImageSharp.Formats.Webp.Lossless { /// @@ -34,37 +29,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private const int PredLowEffort = 11; -#if SUPPORTS_RUNTIME_INTRINSICS - private static readonly Vector128 CollectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte(); - - private static readonly Vector128 CollectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte(); - - private static readonly Vector256 CollectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte(); - - private static readonly Vector256 CollectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte(); - - private static readonly Vector128 CollectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - - private static readonly Vector128 CollectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - - private static readonly Vector128 CollectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); - - private static readonly Vector128 CollectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255); - - private static readonly Vector128 CollectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14); - - private static readonly Vector256 CollectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 
255, 255, 255); - - private static readonly Vector256 CollectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30); - - private static readonly Vector256 CollectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - - private static readonly Vector256 CollectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); - - private static readonly Vector256 CollectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - -#endif - // This uses C#'s compiler optimization to refer to assembly's static data directly. 
private static ReadOnlySpan DeltaLut => new sbyte[] { 16, 16, 8, 4, 2, 2, 2 }; @@ -993,7 +957,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless Span histo = scratch.Slice(0, 256); histo.Clear(); - CollectColorRedTransforms(argb, stride, tileWidth, tileHeight, greenToRed, histo); + ColorSpaceTransformUtils.CollectColorRedTransforms(argb, stride, tileWidth, tileHeight, greenToRed, histo); double curDiff = PredictionCostCrossColor(accumulatedRedHisto, histo); if ((byte)greenToRed == prevX.GreenToRed) @@ -1031,7 +995,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless Span histo = scratch.Slice(0, 256); histo.Clear(); - CollectColorBlueTransforms(argb, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); + ColorSpaceTransformUtils.CollectColorBlueTransforms(argb, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); double curDiff = PredictionCostCrossColor(accumulatedBlueHisto, histo); if ((byte)greenToBlue == prevX.GreenToBlue) { @@ -1070,228 +1034,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return curDiff; } - private static void CollectColorRedTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported && tileWidth >= 16) - { - var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed)); - const int span = 16; - Span values = stackalloc ushort[span]; - for (int y = 0; y < tileHeight; y++) - { - Span srcSpan = bgra.Slice(y * stride); - ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x + span <= tileWidth; x += span) - { - int input0Idx = x; - int input1Idx = x + (span / 2); - Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); - Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector256 g0 = Avx2.And(input0, CollectColorRedTransformsGreenMask256); // 0 0 | g 0 - Vector256 g1 = Avx2.And(input1, CollectColorRedTransformsGreenMask256); 
- Vector256 g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 - Vector256 a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r - Vector256 a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16); - Vector256 a = Avx2.PackUnsignedSaturate(a0, a1); // x r - Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr - Vector256 c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r' - Vector256 d = Avx2.And(c, CollectColorRedTransformsAndMask256); // 0 r' - - ref ushort outputRef = ref MemoryMarshal.GetReference(values); - Unsafe.As>(ref outputRef) = d.AsUInt16(); - - for (int i = 0; i < span; i++) - { - ++histo[values[i]]; - } - } - } - - int leftOver = tileWidth & (span - 1); - if (leftOver > 0) - { - CollectColorRedTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToRed, histo); - } - } - else if (Sse41.IsSupported) - { - var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed)); - const int span = 8; - Span values = stackalloc ushort[span]; - for (int y = 0; y < tileHeight; y++) - { - Span srcSpan = bgra.Slice(y * stride); - ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x + span <= tileWidth; x += span) - { - int input0Idx = x; - int input1Idx = x + (span / 2); - Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); - Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector128 g0 = Sse2.And(input0, CollectColorRedTransformsGreenMask); // 0 0 | g 0 - Vector128 g1 = Sse2.And(input1, CollectColorRedTransformsGreenMask); - Vector128 g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0 - Vector128 a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r - Vector128 a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16); - Vector128 a = Sse41.PackUnsignedSaturate(a0, a1); // x r - Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr - Vector128 c = Sse2.Subtract(a.AsByte(), b.AsByte()); 
// x r' - Vector128 d = Sse2.And(c, CollectColorRedTransformsAndMask); // 0 r' - - ref ushort outputRef = ref MemoryMarshal.GetReference(values); - Unsafe.As>(ref outputRef) = d.AsUInt16(); - - for (int i = 0; i < span; i++) - { - ++histo[values[i]]; - } - } - } - - int leftOver = tileWidth & (span - 1); - if (leftOver > 0) - { - CollectColorRedTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToRed, histo); - } - } - else -#endif - { - CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo); - } - } - - private static void CollectColorRedTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) - { - int pos = 0; - while (tileHeight-- > 0) - { - for (int x = 0; x < tileWidth; x++) - { - int idx = LosslessUtils.TransformColorRed((sbyte)greenToRed, bgra[pos + x]); - ++histo[idx]; - } - - pos += stride; - } - } - - private static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported && tileWidth >= 16) - { - const int span = 16; - Span values = stackalloc ushort[span]; - var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); - var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); - for (int y = 0; y < tileHeight; y++) - { - Span srcSpan = bgra.Slice(y * stride); - ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x + span <= tileWidth; x += span) - { - int input0Idx = x; - int input1Idx = x + (span / 2); - Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); - Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector256 r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); - Vector256 r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256); - Vector256 r = 
Avx2.Or(r0, r1); - Vector256 gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256); - Vector256 gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256); - Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); - Vector256 g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256); - Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr); - Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); - Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte()); - Vector256 d = Avx2.Subtract(c, a.AsByte()); - Vector256 e = Avx2.And(d, CollectColorBlueTransformsBlueMask256); - - ref ushort outputRef = ref MemoryMarshal.GetReference(values); - Unsafe.As>(ref outputRef) = e.AsUInt16(); - - for (int i = 0; i < span; i++) - { - ++histo[values[i]]; - } - } - } - - int leftOver = tileWidth & (span - 1); - if (leftOver > 0) - { - CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); - } - } - else if (Sse41.IsSupported) - { - const int span = 8; - Span values = stackalloc ushort[span]; - var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue)); - var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue)); - for (int y = 0; y < tileHeight; y++) - { - Span srcSpan = bgra.Slice(y * stride); - ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x + span <= tileWidth; x += span) - { - int input0Idx = x; - int input1Idx = x + (span / 2); - Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); - Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); - Vector128 r0 = Ssse3.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask); - Vector128 r1 = Ssse3.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask); - Vector128 r = Sse2.Or(r0, r1); - Vector128 gb0 = Sse2.And(input0, CollectColorBlueTransformsGreenBlueMask); - Vector128 gb1 = Sse2.And(input1, 
CollectColorBlueTransformsGreenBlueMask); - Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); - Vector128 g = Sse2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask); - Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr); - Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); - Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte()); - Vector128 d = Sse2.Subtract(c, a.AsByte()); - Vector128 e = Sse2.And(d, CollectColorBlueTransformsBlueMask); - - ref ushort outputRef = ref MemoryMarshal.GetReference(values); - Unsafe.As>(ref outputRef) = e.AsUInt16(); - - for (int i = 0; i < span; i++) - { - ++histo[values[i]]; - } - } - } - - int leftOver = tileWidth & (span - 1); - if (leftOver > 0) - { - CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); - } - } - else -#endif - { - CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); - } - } - - private static void CollectColorBlueTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) - { - int pos = 0; - while (tileHeight-- > 0) - { - for (int x = 0; x < tileWidth; x++) - { - int idx = LosslessUtils.TransformColorBlue((sbyte)greenToBlue, (sbyte)redToBlue, bgra[pos + x]); - ++histo[idx]; - } - - pos += stride; - } - } - private static float PredictionCostSpatialHistogram(int[][] accumulated, int[][] tile) { double retVal = 0.0d; From 4d5af9c4a9f83a365859d7b49200c696facbc9a7 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 13:44:12 +0100 Subject: [PATCH 22/69] Additional tests for color transforms --- .../WebP/ColorSpaceTransformUtilsTests.cs | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs diff --git a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs 
b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs new file mode 100644 index 0000000000..5306a8c786 --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs @@ -0,0 +1,92 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using SixLabors.ImageSharp.Formats.Webp.Lossless; +using SixLabors.ImageSharp.Tests.TestUtilities; +using Xunit; + +namespace SixLabors.ImageSharp.Tests.Formats.WebP +{ + [Trait("Format", "Webp")] + public class ColorSpaceTransformUtilsTests + { + private static void RunCollectColorBlueTransformsTest() + { + uint[] pixelData = + { + 3074, 256, 256, 256, 0, 65280, 65280, 65280, 256, 256, 0, 256, 0, 65280, 0, 65280, 16711680, 256, + 256, 0, 65024, 0, 256, 256, 0, 65280, 0, 65280, 0, 256, 0, 256 + }; + + int[] expectedOutput = + { + 31, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + + int[] histo = new int[256]; + ColorSpaceTransformUtils.CollectColorBlueTransforms(pixelData, 0, 32, 1, 0, 0, histo); + + Assert.Equal(expectedOutput, histo); + } + + private static void RunCollectColorRedTransformsTest() + { + uint[] pixelData = + { + 3074, 256, 256, 256, 0, 65280, 65280, 65280, 256, 256, 0, 256, 0, 65280, 0, 65280, 16711680, 256, + 256, 0, 
65024, 0, 256, 256, 0, 65280, 0, 65280, 0, 256, 0, 256 + }; + + int[] expectedOutput = + { + 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 + }; + + int[] histo = new int[256]; + ColorSpaceTransformUtils.CollectColorRedTransforms(pixelData, 0, 32, 1, 0, histo); + + Assert.Equal(expectedOutput, histo); + } + + [Fact] + public void CollectColorBlueTransforms_Works() => RunCollectColorBlueTransformsTest(); + + [Fact] + public void CollectColorRedTransforms_Works() => RunCollectColorRedTransformsTest(); + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void CollectColorBlueTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.AllowAll); + + [Fact] + public void CollectColorBlueTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41); + + [Fact] + public void CollectColorBlueTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2); + + [Fact] + public void CollectColorRedTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, 
HwIntrinsics.AllowAll); + + [Fact] + public void CollectColorRedTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41); + + [Fact] + public void CollectColorRedTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2); +#endif + + } +} From c76518b114673cc1e8ff6d6f574978ad2a970b21 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 15:02:51 +0100 Subject: [PATCH 23/69] Add AVX version of TransformColor --- .../Formats/Webp/Lossless/LosslessUtils.cs | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index f9b97c6c44..9a6d974bdd 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -42,8 +42,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static readonly Vector128 TransformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + private static readonly Vector256 TransformColorAlphaGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + private static readonly Vector128 TransformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + private static readonly Vector256 TransformColorRedBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + private static readonly byte TransformColorShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0); private static readonly Vector128 TransformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 
255, 0, 255, 0, 255, 0, 255, 0, 255); @@ -408,7 +412,37 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless public static void TransformColor(Vp8LMultipliers m, Span data, int numPixels) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) + if (Avx2.IsSupported && numPixels >= 8) + { + Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); + Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); + fixed (uint* src = data) + { + int idx; + for (idx = 0; idx + 8 <= numPixels; idx += 8) + { + uint* pos = src + idx; + Vector256 input = Avx.LoadVector256(pos); + Vector256 a = Avx2.And(input.AsByte(), TransformColorAlphaGreenMask256); + Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask); + Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask); + Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector256 e = Avx2.ShiftLeftLogical(input.AsInt16(), 8); + Vector256 f = Avx2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); + Vector256 g = Avx2.ShiftRightLogical(f.AsInt32(), 16); + Vector256 h = Avx2.Add(g.AsByte(), d.AsByte()); + Vector256 i = Avx2.And(h, TransformColorRedBlueMask256); + Vector256 output = Avx2.Subtract(input.AsByte(), i); + Avx.Store((byte*)pos, output); + } + + if (idx != numPixels) + { + TransformColorNoneVectorized(m, data.Slice(idx), numPixels - idx); + } + } + } + else if (Sse2.IsSupported) { Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); @@ -1288,6 +1322,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless #if SUPPORTS_RUNTIME_INTRINSICS [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 MkCst16(int hi, int lo) => Vector128.Create((hi << 16) | (lo & 0xffff)); + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector256 MkCst32(int hi, int lo) => Vector256.Create((hi << 16) | (lo & 0xffff)); #endif private static uint Select(uint a, uint b, uint c, Span scratch) From 
e67ad60e8d2a42d246d785f0ebe91d92b2183aff Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 15:13:56 +0100 Subject: [PATCH 24/69] Add AVX version of TransformColorInverse --- .../Formats/Webp/Lossless/LosslessUtils.cs | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 9a6d974bdd..94ad343c83 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -52,6 +52,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static readonly Vector128 TransformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + private static readonly Vector256 TransformColorInverseAlphaGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + private static readonly byte TransformColorInverseShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0); #endif @@ -505,7 +507,38 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless public static void TransformColorInverse(Vp8LMultipliers m, Span pixelData) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) + if (Avx2.IsSupported && pixelData.Length >= 8) + { + Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); + Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); + fixed (uint* src = pixelData) + { + int idx; + for (idx = 0; idx + 8 <= pixelData.Length; idx += 8) + { + uint* pos = src + idx; + Vector256 input = Avx.LoadVector256(pos); + Vector256 a = Avx2.And(input.AsByte(), TransformColorInverseAlphaGreenMask256); + Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask); + Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask); + Vector256 d = 
Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector256 e = Avx2.Add(input.AsByte(), d.AsByte()); + Vector256 f = Avx2.ShiftLeftLogical(e.AsInt16(), 8); + Vector256 g = Avx2.MultiplyHigh(f, multsb2.AsInt16()); + Vector256 h = Avx2.ShiftRightLogical(g.AsInt32(), 8); + Vector256 i = Avx2.Add(h.AsByte(), f.AsByte()); + Vector256 j = Avx2.ShiftRightLogical(i.AsInt16(), 8); + Vector256 output = Avx2.Or(j.AsByte(), a); + Avx.Store((byte*)pos, output); + } + + if (idx != pixelData.Length) + { + TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); + } + } + } + else if (Sse2.IsSupported) { Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); From 8e5645912cd0b711e865fa8c47ffdbe5be4f83de Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 15:19:07 +0100 Subject: [PATCH 25/69] Add AVX tests --- tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs index c70f332ef6..97567ba218 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs @@ -257,11 +257,17 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Fact] public void TransformColor_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorTest, HwIntrinsics.DisableSSE2); + [Fact] + public void TransformColor_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorTest, HwIntrinsics.DisableAVX2); + [Fact] public void TransformColorInverse_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorInverseTest, HwIntrinsics.AllowAll); [Fact] public void TransformColorInverse_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorInverseTest, 
HwIntrinsics.DisableSSE2); + + [Fact] + public void TransformColorInverse_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorInverseTest, HwIntrinsics.DisableAVX2); #endif } } From b15a021fac71d9643855e4c52e19d955bfd54daa Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 16:14:51 +0100 Subject: [PATCH 26/69] Avoid pinning --- .../Formats/Webp/Lossless/LosslessUtils.cs | 341 ++++++++---------- 1 file changed, 156 insertions(+), 185 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 94ad343c83..c202ad4a8b 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -128,66 +128,57 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (Avx2.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 8 <= numPixels; i += 8) { - int i; - for (i = 0; i + 8 <= numPixels; i += 8) - { - uint* idx = p + i; - Vector256 input = Avx.LoadVector256((ushort*)idx).AsByte(); - Vector256 in0g0g = Avx2.Shuffle(input, AddGreenToBlueAndRedMaskAvx2); - Vector256 output = Avx2.Add(input, in0g0g); - Avx.Store((byte*)idx, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector256 input = Unsafe.As>(ref pos).AsByte(); + Vector256 in0g0g = Avx2.Shuffle(input, AddGreenToBlueAndRedMaskAvx2); + Vector256 output = Avx2.Add(input, in0g0g); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else if (Ssse3.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 4 <= numPixels; i += 4) { - int i; - for (i = 0; i + 4 <= numPixels; i += 4) - { - uint* idx = p + i; - Vector128 input = 
Sse2.LoadVector128((ushort*)idx).AsByte(); - Vector128 in0g0g = Ssse3.Shuffle(input, AddGreenToBlueAndRedMaskSsse3); - Vector128 output = Sse2.Add(input, in0g0g); - Sse2.Store((byte*)idx, output.AsByte()); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector128 input = Unsafe.As>(ref pos).AsByte(); + Vector128 in0g0g = Ssse3.Shuffle(input, AddGreenToBlueAndRedMaskSsse3); + Vector128 output = Sse2.Add(input, in0g0g); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else if (Sse2.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 4 <= numPixels; i += 4) { - int i; - for (i = 0; i + 4 <= numPixels; i += 4) - { - uint* idx = p + i; - Vector128 input = Sse2.LoadVector128((ushort*)idx); - Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g - Vector128 b = Sse2.ShuffleLow(a, AddGreenToBlueAndRedShuffleMask); - Vector128 c = Sse2.ShuffleHigh(b, AddGreenToBlueAndRedShuffleMask); // 0g0g - Vector128 output = Sse2.Add(input.AsByte(), c.AsByte()); - Sse2.Store((byte*)idx, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector128 input = Unsafe.As>(ref pos).AsByte(); + Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g + Vector128 b = Sse2.ShuffleLow(a, AddGreenToBlueAndRedShuffleMask); + Vector128 c = Sse2.ShuffleHigh(b, AddGreenToBlueAndRedShuffleMask); // 0g0g + Vector128 output = Sse2.Add(input.AsByte(), c.AsByte()); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else @@ -217,66 +208,57 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if 
(Avx2.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 8 <= numPixels; i += 8) { - int i; - for (i = 0; i + 8 <= numPixels; i += 8) - { - uint* idx = p + i; - Vector256 input = Avx.LoadVector256((ushort*)idx).AsByte(); - Vector256 in0g0g = Avx2.Shuffle(input, SubtractGreenFromBlueAndRedMaskAvx2); - Vector256 output = Avx2.Subtract(input, in0g0g); - Avx.Store((byte*)idx, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector256 input = Unsafe.As>(ref pos).AsByte(); + Vector256 in0g0g = Avx2.Shuffle(input, SubtractGreenFromBlueAndRedMaskAvx2); + Vector256 output = Avx2.Subtract(input, in0g0g); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else if (Ssse3.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 4 <= numPixels; i += 4) { - int i; - for (i = 0; i + 4 <= numPixels; i += 4) - { - uint* idx = p + i; - Vector128 input = Sse2.LoadVector128((ushort*)idx).AsByte(); - Vector128 in0g0g = Ssse3.Shuffle(input, SubtractGreenFromBlueAndRedMaskSsse3); - Vector128 output = Sse2.Subtract(input, in0g0g); - Sse2.Store((byte*)idx, output.AsByte()); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector128 input = Unsafe.As>(ref pos).AsByte(); + Vector128 in0g0g = Ssse3.Shuffle(input, SubtractGreenFromBlueAndRedMaskSsse3); + Vector128 output = Sse2.Subtract(input, in0g0g); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else if (Sse2.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + 
int i; + for (i = 0; i + 4 <= numPixels; i += 4) { - int i; - for (i = 0; i + 4 <= numPixels; i += 4) - { - uint* idx = p + i; - Vector128 input = Sse2.LoadVector128((ushort*)idx); - Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g - Vector128 b = Sse2.ShuffleLow(a, SubtractGreenFromBlueAndRedShuffleMask); - Vector128 c = Sse2.ShuffleHigh(b, SubtractGreenFromBlueAndRedShuffleMask); // 0g0g - Vector128 output = Sse2.Subtract(input.AsByte(), c.AsByte()); - Sse2.Store((byte*)idx, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector128 input = Unsafe.As>(ref pos).AsByte(); + Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g + Vector128 b = Sse2.ShuffleLow(a, SubtractGreenFromBlueAndRedShuffleMask); + Vector128 c = Sse2.ShuffleHigh(b, SubtractGreenFromBlueAndRedShuffleMask); // 0g0g + Vector128 output = Sse2.Subtract(input.AsByte(), c.AsByte()); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else @@ -409,75 +391,70 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// Color transform keeps the green (G) value as it is, transforms red (R) based on green and transforms blue (B) based on green and then based on red. /// /// The Vp8LMultipliers. - /// The pixel data to transform. + /// The pixel data to transform. /// The number of pixels to process. 
- public static void TransformColor(Vp8LMultipliers m, Span data, int numPixels) + public static void TransformColor(Vp8LMultipliers m, Span pixelData, int numPixels) { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported && numPixels >= 8) { Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); - fixed (uint* src = data) + + int idx; + for (idx = 0; idx + 8 <= numPixels; idx += 8) { - int idx; - for (idx = 0; idx + 8 <= numPixels; idx += 8) - { - uint* pos = src + idx; - Vector256 input = Avx.LoadVector256(pos); - Vector256 a = Avx2.And(input.AsByte(), TransformColorAlphaGreenMask256); - Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask); - Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask); - Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector256 e = Avx2.ShiftLeftLogical(input.AsInt16(), 8); - Vector256 f = Avx2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); - Vector256 g = Avx2.ShiftRightLogical(f.AsInt32(), 16); - Vector256 h = Avx2.Add(g.AsByte(), d.AsByte()); - Vector256 i = Avx2.And(h, TransformColorRedBlueMask256); - Vector256 output = Avx2.Subtract(input.AsByte(), i); - Avx.Store((byte*)pos, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); + Vector256 input = Unsafe.As>(ref pos); + Vector256 a = Avx2.And(input.AsByte(), TransformColorAlphaGreenMask256); + Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask); + Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask); + Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector256 e = Avx2.ShiftLeftLogical(input.AsInt16(), 8); + Vector256 f = Avx2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); + Vector256 g = Avx2.ShiftRightLogical(f.AsInt32(), 16); + Vector256 h = Avx2.Add(g.AsByte(), d.AsByte()); + Vector256 i = Avx2.And(h, TransformColorRedBlueMask256); + Vector256 output = 
Avx2.Subtract(input.AsByte(), i); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (idx != numPixels) - { - TransformColorNoneVectorized(m, data.Slice(idx), numPixels - idx); - } + if (idx != numPixels) + { + TransformColorNoneVectorized(m, pixelData.Slice(idx), numPixels - idx); } } else if (Sse2.IsSupported) { Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); - fixed (uint* src = data) + int idx; + for (idx = 0; idx + 4 <= numPixels; idx += 4) { - int idx; - for (idx = 0; idx + 4 <= numPixels; idx += 4) - { - uint* pos = src + idx; - Vector128 input = Sse2.LoadVector128(pos); - Vector128 a = Sse2.And(input.AsByte(), TransformColorAlphaGreenMask); - Vector128 b = Sse2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask); - Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask); - Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector128 e = Sse2.ShiftLeftLogical(input.AsInt16(), 8); - Vector128 f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); - Vector128 g = Sse2.ShiftRightLogical(f.AsInt32(), 16); - Vector128 h = Sse2.Add(g.AsByte(), d.AsByte()); - Vector128 i = Sse2.And(h, TransformColorRedBlueMask); - Vector128 output = Sse2.Subtract(input.AsByte(), i); - Sse2.Store((byte*)pos, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); + Vector128 input = Unsafe.As>(ref pos); + Vector128 a = Sse2.And(input.AsByte(), TransformColorAlphaGreenMask); + Vector128 b = Sse2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask); + Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask); + Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector128 e = Sse2.ShiftLeftLogical(input.AsInt16(), 8); + Vector128 f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); + Vector128 g = Sse2.ShiftRightLogical(f.AsInt32(), 16); + Vector128 h = Sse2.Add(g.AsByte(), d.AsByte()); + Vector128 i = Sse2.And(h, 
TransformColorRedBlueMask); + Vector128 output = Sse2.Subtract(input.AsByte(), i); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (idx != numPixels) - { - TransformColorNoneVectorized(m, data.Slice(idx), numPixels - idx); - } + if (idx != numPixels) + { + TransformColorNoneVectorized(m, pixelData.Slice(idx), numPixels - idx); } } else #endif { - TransformColorNoneVectorized(m, data, numPixels); + TransformColorNoneVectorized(m, pixelData, numPixels); } } @@ -511,62 +488,57 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); - fixed (uint* src = pixelData) + int idx; + for (idx = 0; idx + 8 <= pixelData.Length; idx += 8) { - int idx; - for (idx = 0; idx + 8 <= pixelData.Length; idx += 8) - { - uint* pos = src + idx; - Vector256 input = Avx.LoadVector256(pos); - Vector256 a = Avx2.And(input.AsByte(), TransformColorInverseAlphaGreenMask256); - Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask); - Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask); - Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector256 e = Avx2.Add(input.AsByte(), d.AsByte()); - Vector256 f = Avx2.ShiftLeftLogical(e.AsInt16(), 8); - Vector256 g = Avx2.MultiplyHigh(f, multsb2.AsInt16()); - Vector256 h = Avx2.ShiftRightLogical(g.AsInt32(), 8); - Vector256 i = Avx2.Add(h.AsByte(), f.AsByte()); - Vector256 j = Avx2.ShiftRightLogical(i.AsInt16(), 8); - Vector256 output = Avx2.Or(j.AsByte(), a); - Avx.Store((byte*)pos, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); + Vector256 input = Unsafe.As>(ref pos); + Vector256 a = Avx2.And(input.AsByte(), TransformColorInverseAlphaGreenMask256); + Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask); + Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask); + 
Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector256 e = Avx2.Add(input.AsByte(), d.AsByte()); + Vector256 f = Avx2.ShiftLeftLogical(e.AsInt16(), 8); + Vector256 g = Avx2.MultiplyHigh(f, multsb2.AsInt16()); + Vector256 h = Avx2.ShiftRightLogical(g.AsInt32(), 8); + Vector256 i = Avx2.Add(h.AsByte(), f.AsByte()); + Vector256 j = Avx2.ShiftRightLogical(i.AsInt16(), 8); + Vector256 output = Avx2.Or(j.AsByte(), a); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (idx != pixelData.Length) - { - TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); - } + if (idx != pixelData.Length) + { + TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); } } else if (Sse2.IsSupported) { Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); - fixed (uint* src = pixelData) + + int idx; + for (idx = 0; idx + 4 <= pixelData.Length; idx += 4) { - int idx; - for (idx = 0; idx + 4 <= pixelData.Length; idx += 4) - { - uint* pos = src + idx; - Vector128 input = Sse2.LoadVector128(pos); - Vector128 a = Sse2.And(input.AsByte(), TransformColorInverseAlphaGreenMask); - Vector128 b = Sse2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask); - Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask); - Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector128 e = Sse2.Add(input.AsByte(), d.AsByte()); - Vector128 f = Sse2.ShiftLeftLogical(e.AsInt16(), 8); - Vector128 g = Sse2.MultiplyHigh(f, multsb2.AsInt16()); - Vector128 h = Sse2.ShiftRightLogical(g.AsInt32(), 8); - Vector128 i = Sse2.Add(h.AsByte(), f.AsByte()); - Vector128 j = Sse2.ShiftRightLogical(i.AsInt16(), 8); - Vector128 output = Sse2.Or(j.AsByte(), a); - Sse2.Store((byte*)pos, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); + Vector128 input = Unsafe.As>(ref pos); + Vector128 a = Sse2.And(input.AsByte(), 
TransformColorInverseAlphaGreenMask); + Vector128 b = Sse2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask); + Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask); + Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector128 e = Sse2.Add(input.AsByte(), d.AsByte()); + Vector128 f = Sse2.ShiftLeftLogical(e.AsInt16(), 8); + Vector128 g = Sse2.MultiplyHigh(f, multsb2.AsInt16()); + Vector128 h = Sse2.ShiftRightLogical(g.AsInt32(), 8); + Vector128 i = Sse2.Add(h.AsByte(), f.AsByte()); + Vector128 j = Sse2.ShiftRightLogical(i.AsInt16(), 8); + Vector128 output = Sse2.Or(j.AsByte(), a); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (idx != pixelData.Length) - { - TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); - } + if (idx != pixelData.Length) + { + TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); } } else @@ -885,15 +857,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless int correction = (int)((23 * (origV & (y - 1))) >> 4); return (vF * (WebpLookupTables.Log2Table[v] + logCnt)) + correction; } - else - { - return (float)(Log2Reciprocal * v * Math.Log(v)); - } + + return (float)(Log2Reciprocal * v * Math.Log(v)); } private static float FastLog2Slow(uint v) { Guard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); + if (v < ApproxLogWithCorrectionMax) { int logCnt = 0; From 7959d0bd8b6742e24451ca58f1f6febc7b445476 Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Mon, 15 Nov 2021 19:21:33 +0100 Subject: [PATCH 27/69] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Günther Foidl --- .../Formats/Webp/Lossless/ColorSpaceTransformUtils.cs | 8 ++++---- src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs 
b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs index 4a8488f1b0..87b9afa544 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs @@ -56,7 +56,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { Span srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x + span <= tileWidth; x += span) + for (int x = 0; x <= tileWidth - span; x += span) { int input0Idx = x; int input1Idx = x + (span / 2); @@ -101,7 +101,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { Span srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x + span <= tileWidth; x += span) + for (int x = 0; x <= tileWidth - span; x += span) { int input0Idx = x; int input1Idx = x + (span / 2); @@ -170,7 +170,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { Span srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x + span <= tileWidth; x += span) + for (int x = 0; x <= tileWidth - span; x += span) { int input0Idx = x; int input1Idx = x + (span / 2); @@ -211,7 +211,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { Span srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x + span <= tileWidth; x += span) + for (int x = 0; x <= tileWidth - span; x += span) { int input0Idx = x; int input1Idx = x + (span / 2); diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs index 1f7b284e9d..a1e04c66a5 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs @@ -561,7 +561,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return (byte)(lower + (quantization >> 1)); } - return 
(byte)(upper & 0xff); + return (byte)upper; } /// From c491cbba36907734da66fd49333571d739ca0bc1 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 19:55:45 +0100 Subject: [PATCH 28/69] Use nint for inner loop x variable --- .../Webp/Lossless/ColorSpaceTransformUtils.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs index 87b9afa544..71f3c5ca9e 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs @@ -56,10 +56,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { Span srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x <= tileWidth - span; x += span) + for (nint x = 0; x <= tileWidth - span; x += span) { - int input0Idx = x; - int input1Idx = x + (span / 2); + nint input0Idx = x; + nint input1Idx = x + (span / 2); Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); Vector256 r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); @@ -101,10 +101,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { Span srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x <= tileWidth - span; x += span) + for (nint x = 0; x <= tileWidth - span; x += span) { - int input0Idx = x; - int input1Idx = x + (span / 2); + nint input0Idx = x; + nint input1Idx = x + (span / 2); Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); Vector128 r0 = Ssse3.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask); @@ -170,10 +170,10 @@ namespace 
SixLabors.ImageSharp.Formats.Webp.Lossless { Span srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x <= tileWidth - span; x += span) + for (nint x = 0; x <= tileWidth - span; x += span) { - int input0Idx = x; - int input1Idx = x + (span / 2); + nint input0Idx = x; + nint input1Idx = x + (span / 2); Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); Vector256 g0 = Avx2.And(input0, CollectColorRedTransformsGreenMask256); // 0 0 | g 0 @@ -211,10 +211,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { Span srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); - for (int x = 0; x <= tileWidth - span; x += span) + for (nint x = 0; x <= tileWidth - span; x += span) { - int input0Idx = x; - int input1Idx = x + (span / 2); + nint input0Idx = x; + nint input1Idx = x + (span / 2); Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); Vector128 g0 = Sse2.And(input0, CollectColorRedTransformsGreenMask); // 0 0 | g 0 From ff77361e7c8277c5eddd71614dcbd808e22360cf Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Mon, 15 Nov 2021 20:00:25 +0100 Subject: [PATCH 29/69] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Günther Foidl --- .../Formats/Webp/Lossless/LosslessUtils.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index c202ad4a8b..5903ba9a29 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ 
b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -147,7 +147,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { int numPixels = pixelData.Length; int i; - for (i = 0; i + 4 <= numPixels; i += 4) + for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); @@ -165,7 +165,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { int numPixels = pixelData.Length; int i; - for (i = 0; i + 4 <= numPixels; i += 4) + for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); @@ -209,7 +209,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { int numPixels = pixelData.Length; int i; - for (i = 0; i + 8 <= numPixels; i += 8) + for (i = 0; i <= numPixels - 8; i += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); @@ -227,7 +227,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { int numPixels = pixelData.Length; int i; - for (i = 0; i + 4 <= numPixels; i += 4) + for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); @@ -245,7 +245,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { int numPixels = pixelData.Length; int i; - for (i = 0; i + 4 <= numPixels; i += 4) + for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); @@ -402,7 +402,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); int idx; - for (idx = 0; idx + 8 <= numPixels; idx += 8) + for (idx = 0; idx <= numPixels - 8; idx += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), 
idx); Vector256 input = Unsafe.As>(ref pos); @@ -429,7 +429,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); int idx; - for (idx = 0; idx + 4 <= numPixels; idx += 4) + for (idx = 0; idx <= numPixels - 4; idx += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector128 input = Unsafe.As>(ref pos); @@ -489,7 +489,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); int idx; - for (idx = 0; idx + 8 <= pixelData.Length; idx += 8) + for (idx = 0; idx <= pixelData.Length - 8; idx += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector256 input = Unsafe.As>(ref pos); @@ -518,7 +518,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); int idx; - for (idx = 0; idx + 4 <= pixelData.Length; idx += 4) + for (idx = 0; idx <= pixelData.Length - 4; idx += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector128 input = Unsafe.As>(ref pos); From b53aab44b36a1d9d6c90457c724d3e53d39d90ba Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Mon, 15 Nov 2021 20:03:57 +0100 Subject: [PATCH 30/69] Change loop condition to i <= numPixels - 8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Günther Foidl --- src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 5903ba9a29..ca021ba9d2 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ 
b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -129,7 +129,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { int numPixels = pixelData.Length; int i; - for (i = 0; i + 8 <= numPixels; i += 8) + for (i = 0; i <= numPixels - 8; i += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); From 00d20b8ee55b0eb01297b36cbb9dee1f697df27c Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 16 Nov 2021 22:06:47 +1100 Subject: [PATCH 31/69] Use nint and rename scalar fallback --- .../Formats/Webp/Lossless/LosslessUtils.cs | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index ca021ba9d2..84b01846ba 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -128,7 +128,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (Avx2.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 8; i += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -140,13 +140,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (i != numPixels) { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); + AddGreenToBlueAndRedScalar(pixelData.Slice((int)i)); } } else if (Ssse3.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -158,13 +158,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (i != numPixels) { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); + AddGreenToBlueAndRedScalar(pixelData.Slice((int)i)); } } else if (Sse2.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 4; i += 4) 
{ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -178,17 +178,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (i != numPixels) { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); + AddGreenToBlueAndRedScalar(pixelData.Slice((int)i)); } } else #endif { - AddGreenToBlueAndRedNoneVectorized(pixelData); + AddGreenToBlueAndRedScalar(pixelData); } } - private static void AddGreenToBlueAndRedNoneVectorized(Span pixelData) + private static void AddGreenToBlueAndRedScalar(Span pixelData) { int numPixels = pixelData.Length; for (int i = 0; i < numPixels; i++) @@ -208,7 +208,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (Avx2.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 8; i += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -220,13 +220,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (i != numPixels) { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); + SubtractGreenFromBlueAndRedScalar(pixelData.Slice((int)i)); } } else if (Ssse3.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -238,13 +238,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (i != numPixels) { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); + SubtractGreenFromBlueAndRedScalar(pixelData.Slice((int)i)); } } else if (Sse2.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -258,17 +258,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (i != numPixels) { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); + SubtractGreenFromBlueAndRedScalar(pixelData.Slice((int)i)); } } else #endif { - 
SubtractGreenFromBlueAndRedNoneVectorized(pixelData); + SubtractGreenFromBlueAndRedScalar(pixelData); } } - private static void SubtractGreenFromBlueAndRedNoneVectorized(Span pixelData) + private static void SubtractGreenFromBlueAndRedScalar(Span pixelData) { int numPixels = pixelData.Length; for (int i = 0; i < numPixels; i++) @@ -401,7 +401,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); - int idx; + nint idx; for (idx = 0; idx <= numPixels - 8; idx += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); @@ -421,14 +421,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (idx != numPixels) { - TransformColorNoneVectorized(m, pixelData.Slice(idx), numPixels - idx); + TransformColorScalar(m, pixelData.Slice((int)idx), numPixels - (int)idx); } } else if (Sse2.IsSupported) { Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); - int idx; + nint idx; for (idx = 0; idx <= numPixels - 4; idx += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); @@ -448,17 +448,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (idx != numPixels) { - TransformColorNoneVectorized(m, pixelData.Slice(idx), numPixels - idx); + TransformColorScalar(m, pixelData.Slice((int)idx), numPixels - (int)idx); } } else #endif { - TransformColorNoneVectorized(m, pixelData, numPixels); + TransformColorScalar(m, pixelData, numPixels); } } - private static void TransformColorNoneVectorized(Vp8LMultipliers m, Span data, int numPixels) + private static void TransformColorScalar(Vp8LMultipliers m, Span data, int numPixels) { for (int i = 0; i < numPixels; i++) { @@ -488,7 +488,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector256 multsb2 
= MkCst32(Cst5b(m.RedToBlue), 0); - int idx; + nint idx; for (idx = 0; idx <= pixelData.Length - 8; idx += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); @@ -509,7 +509,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (idx != pixelData.Length) { - TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); + TransformColorInverseScalar(m, pixelData.Slice((int)idx)); } } else if (Sse2.IsSupported) @@ -517,7 +517,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); - int idx; + nint idx; for (idx = 0; idx <= pixelData.Length - 4; idx += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); @@ -538,17 +538,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (idx != pixelData.Length) { - TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); + TransformColorInverseScalar(m, pixelData.Slice((int)idx)); } } else #endif { - TransformColorInverseNoneVectorized(m, pixelData); + TransformColorInverseScalar(m, pixelData); } } - private static void TransformColorInverseNoneVectorized(Vp8LMultipliers m, Span pixelData) + private static void TransformColorInverseScalar(Vp8LMultipliers m, Span pixelData) { for (int i = 0; i < pixelData.Length; i++) { From 31e6230801a959060900f7fb04e556a8d9c85a68 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Tue, 16 Nov 2021 18:57:35 +0100 Subject: [PATCH 32/69] Delete benchmark.sh --- tests/ImageSharp.Benchmarks/benchmark.sh | 7 ------- 1 file changed, 7 deletions(-) delete mode 100755 tests/ImageSharp.Benchmarks/benchmark.sh diff --git a/tests/ImageSharp.Benchmarks/benchmark.sh b/tests/ImageSharp.Benchmarks/benchmark.sh deleted file mode 100755 index f51a9833aa..0000000000 --- a/tests/ImageSharp.Benchmarks/benchmark.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# Build in release mode -dotnet build -c Release -f 
netcoreapp2.0 - -# Run benchmarks -dotnet bin/Release/netcoreapp2.0/ImageSharp.Benchmarks.dll \ No newline at end of file From 56890ba07666c74f9ad6b057353a8a776445fed2 Mon Sep 17 00:00:00 2001 From: Turnerj Date: Wed, 17 Nov 2021 23:20:18 +1030 Subject: [PATCH 33/69] Add missing premultiply --- .../Processors/Convolution/Convolution2PassProcessor{TPixel}.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs index 365b2e2dfc..3f4809c115 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs @@ -396,6 +396,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution PixelOperations.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer); + Numerics.Premultiply(sourceBuffer); + ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer); ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceBase, sourceBuffer.Length); ref Vector4 targetStart = ref targetBase; From 088e92cfcf94ee073e1131660b70fa9077aced7a Mon Sep 17 00:00:00 2001 From: Turnerj Date: Thu, 18 Nov 2021 01:03:32 +1030 Subject: [PATCH 34/69] Add reference image for gaussian blur issue --- .../Convolution/Basic1ParameterConvolutionTests.cs | 9 +++++---- .../Convolution/BoxBlurTest/InBox_Rgba32_blur_3.png | 3 +++ .../Convolution/BoxBlurTest/InBox_Rgba32_blur_5.png | 3 +++ .../BoxBlurTest/OnFullImage_Rgba32_blur_3.png | 3 +++ .../BoxBlurTest/OnFullImage_Rgba32_blur_5.png | 3 +++ .../Convolution/GaussianBlurTest/InBox_Rgba32_blur_3.png | 3 +++ .../Convolution/GaussianBlurTest/InBox_Rgba32_blur_5.png | 3 +++ .../GaussianBlurTest/OnFullImage_Rgba32_blur_3.png | 3 +++ .../GaussianBlurTest/OnFullImage_Rgba32_blur_5.png | 3 +++ .../GaussianSharpenTest/InBox_Rgba32_blur_3.png | 3 +++ 
.../GaussianSharpenTest/InBox_Rgba32_blur_5.png | 3 +++ .../GaussianSharpenTest/OnFullImage_Rgba32_blur_3.png | 3 +++ .../GaussianSharpenTest/OnFullImage_Rgba32_blur_5.png | 3 +++ 13 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_3.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_5.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_3.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_5.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_3.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_5.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_3.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_5.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_3.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_5.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_3.png create mode 100644 tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_5.png diff --git a/tests/ImageSharp.Tests/Processing/Processors/Convolution/Basic1ParameterConvolutionTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Convolution/Basic1ParameterConvolutionTests.cs index 7e0676aab7..8eed3683e8 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Convolution/Basic1ParameterConvolutionTests.cs +++ 
b/tests/ImageSharp.Tests/Processing/Processors/Convolution/Basic1ParameterConvolutionTests.cs @@ -17,10 +17,11 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution public static readonly TheoryData Values = new TheoryData { 3, 5 }; public static readonly string[] InputImages = - { - TestImages.Bmp.Car, - TestImages.Png.CalliphoraPartial - }; + { + TestImages.Bmp.Car, + TestImages.Png.CalliphoraPartial, + TestImages.Png.Blur + }; [Theory] [WithFileCollection(nameof(InputImages), nameof(Values), PixelTypes.Rgba32)] diff --git a/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_3.png b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_3.png new file mode 100644 index 0000000000..dc28cf4f7a --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6dd45683953e7cecbbaaa339b78db1303f9583b8d0988fe1948c6b1b4ba297a +size 121550 diff --git a/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_5.png b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_5.png new file mode 100644 index 0000000000..11d0b6e050 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3867cbbc1d425ceba20dd392de0728ce4de652860491e87434cd33675f56d8e +size 117863 diff --git a/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_3.png b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_3.png new file mode 100644 index 0000000000..df6e2f2048 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:186c35bc159c7125f59b47866021051ff74368b9021dd09ad3c6386b39be3546 +size 80992 diff --git a/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_5.png b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_5.png new file mode 100644 index 0000000000..4bbc0604ca --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d7413d1d7ac69feb1d1f0a61d0d4a8228d3276337446d2c761ce58b0813cf66 +size 67243 diff --git a/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_3.png b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_3.png new file mode 100644 index 0000000000..1e75f17235 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ace61fd7330b5e52b7aa09af937259d200b71fa152bf1ffdc6b891e5b61abfd5 +size 117133 diff --git a/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_5.png b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_5.png new file mode 100644 index 0000000000..8a94424494 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc2f26bda2dec8354d8b77887806012f28f54b8a8f7e39e7e4bcb4d872d29042 +size 114247 diff --git a/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_3.png b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_3.png new file mode 100644 index 0000000000..eb371ba5f8 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_3.png @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aac58316fa795c2683f7cfac34f69ba71501abd78e0d72076cc36c439a8fa7a +size 63680 diff --git a/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_5.png b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_5.png new file mode 100644 index 0000000000..52f4f2bda2 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7bf28351fa51e0e9b0c2fd4b3fc7a30b0b3a8c1ca2dc9dd62ec5fab56e22c10 +size 50451 diff --git a/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_3.png b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_3.png new file mode 100644 index 0000000000..956facf68d --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6eecdf3bf90a2dd9430ce8501ab98f7a25f4f06674673fd6b9ca6a44435d303 +size 239962 diff --git a/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_5.png b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_5.png new file mode 100644 index 0000000000..98c0096af7 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc3a46595d648a4551f499e1246ccdb63a80f424487fb7306fd3cfd772f5f1e +size 238816 diff --git a/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_3.png b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_3.png new file mode 100644 index 0000000000..936b774f71 --- /dev/null +++ 
b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59ca62ae017d8f5a19dbd0f61ded29d936c325553eb3e08fe39f2440d4c941eb +size 356290 diff --git a/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_5.png b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_5.png new file mode 100644 index 0000000000..4b6642daed --- /dev/null +++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:427d325ace605fe9a22702dcd8bff20dff888293def6569c4dc635b56c732565 +size 351992 From c2bf48e9456124bed37f214be98c62074bc053c9 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Fri, 19 Nov 2021 10:53:27 +0100 Subject: [PATCH 35/69] remove HistogramEqualizationOptions.Default --- .../Normalization/HistogramEqualizationExtensions.cs | 2 +- .../Processors/Normalization/HistogramEqualizationOptions.cs | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/ImageSharp/Processing/Extensions/Normalization/HistogramEqualizationExtensions.cs b/src/ImageSharp/Processing/Extensions/Normalization/HistogramEqualizationExtensions.cs index a8ac3376ab..c1046f82d1 100644 --- a/src/ImageSharp/Processing/Extensions/Normalization/HistogramEqualizationExtensions.cs +++ b/src/ImageSharp/Processing/Extensions/Normalization/HistogramEqualizationExtensions.cs @@ -16,7 +16,7 @@ namespace SixLabors.ImageSharp.Processing /// The image this method extends. /// The to allow chaining of operations. 
public static IImageProcessingContext HistogramEqualization(this IImageProcessingContext source) => - HistogramEqualization(source, HistogramEqualizationOptions.Default); + HistogramEqualization(source, new HistogramEqualizationOptions()); /// /// Equalizes the histogram of an image to increases the contrast. diff --git a/src/ImageSharp/Processing/Processors/Normalization/HistogramEqualizationOptions.cs b/src/ImageSharp/Processing/Processors/Normalization/HistogramEqualizationOptions.cs index 602dc0c4be..1b8723e4fa 100644 --- a/src/ImageSharp/Processing/Processors/Normalization/HistogramEqualizationOptions.cs +++ b/src/ImageSharp/Processing/Processors/Normalization/HistogramEqualizationOptions.cs @@ -8,11 +8,6 @@ namespace SixLabors.ImageSharp.Processing.Processors.Normalization /// public class HistogramEqualizationOptions { - /// - /// Gets the default instance. - /// - public static HistogramEqualizationOptions Default { get; } = new HistogramEqualizationOptions(); - /// /// Gets or sets the histogram equalization method to use. Defaults to global histogram equalization. 
/// From 83455e7e0af3d07d360ca6201e10e9f60786e584 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Fri, 19 Nov 2021 10:53:44 +0100 Subject: [PATCH 36/69] replace tabs with spaces in LossyUtilsTests --- tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index d176a5933d..69a24843ca 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -38,7 +38,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP int actual = LossyUtils.Vp8_Sse4X4(a, b); Assert.Equal(expected, actual); - } + } private static void RunMean16x4Test() { From 7ee7952a2c4352cfd219bbea5da3eeb75a84827b Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Fri, 19 Nov 2021 11:01:38 +0100 Subject: [PATCH 37/69] make ColorDistanceCache non-readonly --- .../Processors/Quantization/EuclideanPixelMap{TPixel}.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ImageSharp/Processing/Processors/Quantization/EuclideanPixelMap{TPixel}.cs b/src/ImageSharp/Processing/Processors/Quantization/EuclideanPixelMap{TPixel}.cs index b82ce71bbd..cac11b4a8f 100644 --- a/src/ImageSharp/Processing/Processors/Quantization/EuclideanPixelMap{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Quantization/EuclideanPixelMap{TPixel}.cs @@ -22,7 +22,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Quantization where TPixel : unmanaged, IPixel { private Rgba32[] rgbaPalette; - private readonly ColorDistanceCache cache; + private ColorDistanceCache cache; private readonly Configuration configuration; /// From 61bfb9e60cc2eb75f7fa18561c046b9c4bdf1fe5 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Fri, 19 Nov 2021 11:07:56 +0100 Subject: [PATCH 38/69] add comment on ColorDistanceCache member --- 
.../Processors/Quantization/EuclideanPixelMap{TPixel}.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ImageSharp/Processing/Processors/Quantization/EuclideanPixelMap{TPixel}.cs b/src/ImageSharp/Processing/Processors/Quantization/EuclideanPixelMap{TPixel}.cs index cac11b4a8f..f544893484 100644 --- a/src/ImageSharp/Processing/Processors/Quantization/EuclideanPixelMap{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Quantization/EuclideanPixelMap{TPixel}.cs @@ -22,6 +22,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Quantization where TPixel : unmanaged, IPixel { private Rgba32[] rgbaPalette; + + // Do not make this readonly! Struct value would be always copied on non-readonly method calls. private ColorDistanceCache cache; private readonly Configuration configuration; From 3de317f6d589c01d6332584c79dd2bc07b14374a Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sat, 20 Nov 2021 16:56:08 +0100 Subject: [PATCH 39/69] Change hashchain to use the memoryAllocator --- .../Webp/Lossless/BackwardReferenceEncoder.cs | 15 +++-- .../Formats/Webp/Lossless/Vp8LEncoder.cs | 9 ++- .../Formats/Webp/Lossless/Vp8LHashChain.cs | 59 ++++++++++++++----- 3 files changed, 58 insertions(+), 25 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs index dc546f8ac2..93f6372c64 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs @@ -3,10 +3,11 @@ using System; using System.Collections.Generic; +using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Formats.Webp.Lossless { - internal class BackwardReferenceEncoder + internal static class BackwardReferenceEncoder { /// /// Maximum bit length. 
@@ -41,6 +42,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless int quality, int lz77TypesToTry, ref int cacheBits, + MemoryAllocator memoryAllocator, Vp8LHashChain hashChain, Vp8LBackwardRefs best, Vp8LBackwardRefs worst) @@ -69,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless BackwardReferencesLz77(width, height, bgra, 0, hashChain, worst); break; case Vp8LLz77Type.Lz77Box: - hashChainBox = new Vp8LHashChain(width * height); + hashChainBox = new Vp8LHashChain(memoryAllocator, width * height); BackwardReferencesLz77Box(width, height, bgra, 0, hashChain, hashChainBox, worst); break; } @@ -617,7 +619,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } } - hashChain.OffsetLength[0] = 0; + Span hashChainOffsetLength = hashChain.OffsetLength.GetSpan(); + hashChainOffsetLength[0] = 0; for (i = 1; i < pixelCount; i++) { int ind; @@ -695,19 +698,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (bestLength <= MinLength) { - hashChain.OffsetLength[i] = 0; + hashChainOffsetLength[i] = 0; bestOffsetPrev = 0; bestLengthPrev = 0; } else { - hashChain.OffsetLength[i] = (uint)((bestOffset << MaxLengthBits) | bestLength); + hashChainOffsetLength[i] = (uint)((bestOffset << MaxLengthBits) | bestLength); bestOffsetPrev = bestOffset; bestLengthPrev = bestLength; } } - hashChain.OffsetLength[0] = 0; + hashChainOffsetLength[0] = 0; BackwardReferencesLz77(xSize, ySize, bgra, cacheBits, hashChain, refs); } diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs index da815a479a..48f7d0e2b7 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs @@ -124,7 +124,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless this.EncodedData = memoryAllocator.Allocate(pixelCount); this.Palette = memoryAllocator.Allocate(WebpConstants.MaxPaletteSize); this.Refs = new Vp8LBackwardRefs[3]; - this.HashChain = new 
Vp8LHashChain(pixelCount); + this.HashChain = new Vp8LHashChain(memoryAllocator, pixelCount); // We round the block size up, so we're guaranteed to have at most MaxRefsBlockPerImage blocks used: int refsBlockSize = ((pixelCount - 1) / MaxRefsBlockPerImage) + 1; @@ -515,7 +515,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } // Calculate backward references from BGRA image. - this.HashChain.Fill(this.memoryAllocator, bgra, this.quality, width, height, lowEffort); + this.HashChain.Fill(bgra, this.quality, width, height, lowEffort); Vp8LBitWriter bitWriterBest = config.SubConfigs.Count > 1 ? this.bitWriter.Clone() : this.bitWriter; Vp8LBitWriter bwInit = this.bitWriter; @@ -529,6 +529,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless this.quality, subConfig.Lz77, ref cacheBits, + this.memoryAllocator, this.HashChain, this.Refs[0], this.Refs[1]); @@ -735,7 +736,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } // Calculate backward references from the image pixels. 
- hashChain.Fill(this.memoryAllocator, bgra, quality, width, height, lowEffort); + hashChain.Fill(bgra, quality, width, height, lowEffort); Vp8LBackwardRefs refs = BackwardReferenceEncoder.GetBackwardReferences( width, @@ -744,6 +745,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless quality, (int)Vp8LLz77Type.Lz77Standard | (int)Vp8LLz77Type.Lz77Rle, ref cacheBits, + this.memoryAllocator, hashChain, refsTmp1, refsTmp2); @@ -1802,6 +1804,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless this.BgraScratch.Dispose(); this.Palette.Dispose(); this.TransformData.Dispose(); + this.HashChain.Dispose(); } } } diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs index 977a094bd1..2aa35e392e 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs @@ -8,7 +8,7 @@ using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Formats.Webp.Lossless { - internal class Vp8LHashChain + internal class Vp8LHashChain : IDisposable { private const uint HashMultiplierHi = 0xc6a4a793u; @@ -28,14 +28,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// private const int WindowSize = (1 << WindowSizeBits) - 120; + private readonly MemoryAllocator memoryAllocator; + + private bool disposed; + /// /// Initializes a new instance of the class. /// + /// The memory allocator. /// The size off the chain. - public Vp8LHashChain(int size) + public Vp8LHashChain(MemoryAllocator memoryAllocator, int size) { - this.OffsetLength = new uint[size]; - this.OffsetLength.AsSpan().Fill(0xcdcdcdcd); + this.memoryAllocator = memoryAllocator; + this.OffsetLength = this.memoryAllocator.Allocate(size, AllocationOptions.Clean); this.Size = size; } @@ -45,16 +50,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// These 20 bits are the limit defined by GetWindowSizeForHashChain (through WindowSize = 1 << 20). 
/// The lower 12 bits contain the length of the match. /// - public uint[] OffsetLength { get; } + public IMemoryOwner OffsetLength { get; } /// /// Gets the size of the hash chain. - /// This is the maximum size of the hash_chain that can be constructed. + /// This is the maximum size of the hashchain that can be constructed. /// Typically this is the pixel count (width x height) for a given image. /// public int Size { get; } - public void Fill(MemoryAllocator memoryAllocator, ReadOnlySpan bgra, int quality, int xSize, int ySize, bool lowEffort) + public void Fill(ReadOnlySpan bgra, int quality, int xSize, int ySize, bool lowEffort) { int size = xSize * ySize; int iterMax = GetMaxItersForQuality(quality); @@ -63,20 +68,21 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if (size <= 2) { - this.OffsetLength[0] = 0; + this.OffsetLength.GetSpan()[0] = 0; return; } - using IMemoryOwner hashToFirstIndexBuffer = memoryAllocator.Allocate(HashSize); + using IMemoryOwner hashToFirstIndexBuffer = this.memoryAllocator.Allocate(HashSize); + using IMemoryOwner chainBuffer = this.memoryAllocator.Allocate(size, AllocationOptions.Clean); Span hashToFirstIndex = hashToFirstIndexBuffer.GetSpan(); + Span chain = chainBuffer.GetSpan(); // Initialize hashToFirstIndex array to -1. hashToFirstIndex.Fill(-1); - int[] chain = new int[size]; - // Fill the chain linking pixels with the same hash. bool bgraComp = bgra.Length > 1 && bgra[0] == bgra[1]; + Span tmp = stackalloc uint[2]; for (pos = 0; pos < size - 2;) { uint hashCode; @@ -85,7 +91,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { // Consecutive pixels with the same color will share the same hash. // We therefore use a different hash: the color and its repetition length. 
- uint[] tmp = new uint[2]; + tmp.Clear(); uint len = 1; tmp[0] = bgra[pos]; @@ -134,7 +140,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless // Find the best match interval at each pixel, defined by an offset to the // pixel and a length. The right-most pixel cannot match anything to the right // (hence a best length of 0) and the left-most pixel nothing to the left (hence an offset of 0). - this.OffsetLength[0] = this.OffsetLength[size - 1] = 0; + Span offsetLength = this.OffsetLength.GetSpan(); + offsetLength[0] = offsetLength[size - 1] = 0; for (int basePosition = size - 2; basePosition > 0;) { int maxLen = LosslessUtils.MaxFindCopyLength(size - 1 - basePosition); @@ -208,7 +215,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless uint maxBasePosition = (uint)basePosition; while (true) { - this.OffsetLength[basePosition] = (bestDistance << BackwardReferenceEncoder.MaxLengthBits) | (uint)bestLength; + offsetLength[basePosition] = (bestDistance << BackwardReferenceEncoder.MaxLengthBits) | (uint)bestLength; --basePosition; // Stop if we don't have a match or if we are out of bounds. @@ -242,10 +249,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } [MethodImpl(InliningOptions.ShortMethod)] - public int FindLength(int basePosition) => (int)(this.OffsetLength[basePosition] & ((1U << BackwardReferenceEncoder.MaxLengthBits) - 1)); + public int FindLength(int basePosition) => (int)(this.OffsetLength.GetSpan()[basePosition] & ((1U << BackwardReferenceEncoder.MaxLengthBits) - 1)); [MethodImpl(InliningOptions.ShortMethod)] - public int FindOffset(int basePosition) => (int)(this.OffsetLength[basePosition] >> BackwardReferenceEncoder.MaxLengthBits); + public int FindOffset(int basePosition) => (int)(this.OffsetLength.GetSpan()[basePosition] >> BackwardReferenceEncoder.MaxLengthBits); /// /// Calculates the hash for a pixel pair. @@ -280,5 +287,25 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return maxWindowSize > WindowSize ? 
WindowSize : maxWindowSize; } + + protected virtual void Dispose(bool disposing) + { + if (!this.disposed) + { + if (disposing) + { + this.OffsetLength.Dispose(); + } + + this.disposed = true; + } + } + + /// + public void Dispose() + { + this.Dispose(disposing: true); + GC.SuppressFinalize(this); + } } } From 40b6f4e55bb12127bc84d33c8885dd21a74fb768 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sat, 20 Nov 2021 17:00:34 +0100 Subject: [PATCH 40/69] Change huffman code to a struct --- .../Formats/Webp/Lossless/HTreeGroup.cs | 4 -- .../Formats/Webp/Lossless/HuffmanCode.cs | 2 +- .../Formats/Webp/Lossless/HuffmanUtils.cs | 45 ++++++++++--------- .../Webp/Lossless/WebpLosslessDecoder.cs | 14 +++--- .../WebP/ColorSpaceTransformUtilsTests.cs | 2 +- .../Formats/WebP/LossyUtilsTests.cs | 2 +- .../Formats/WebP/QuantEncTests.cs | 2 +- .../Formats/WebP/Vp8EncodingTests.cs | 2 +- 8 files changed, 35 insertions(+), 38 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs b/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs index a038248f1a..09ceb0334a 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs @@ -19,10 +19,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { this.HTrees = new List(WebpConstants.HuffmanCodesPerMetaCode); this.PackedTable = new HuffmanCode[packedTableSize]; - for (int i = 0; i < packedTableSize; i++) - { - this.PackedTable[i] = new HuffmanCode(); - } } /// diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanCode.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanCode.cs index f75c64de11..efb9283568 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanCode.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanCode.cs @@ -9,7 +9,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// A classic way to do entropy coding where a smaller number of bits are used for more frequent codes. 
/// [DebuggerDisplay("BitsUsed: {BitsUsed}, Value: {Value}")] - internal class HuffmanCode + internal struct HuffmanCode { /// /// Gets or sets the number of bits used for this symbol. diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs index 5db01ca1c7..66170b85fd 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs @@ -2,6 +2,7 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Runtime.CompilerServices; namespace SixLabors.ImageSharp.Formats.Webp.Lossless { @@ -307,9 +308,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless public static int BuildHuffmanTable(Span table, int rootBits, int[] codeLengths, int codeLengthsSize) { - Guard.MustBeGreaterThan(rootBits, 0, nameof(rootBits)); - Guard.NotNull(codeLengths, nameof(codeLengths)); - Guard.MustBeGreaterThan(codeLengthsSize, 0, nameof(codeLengthsSize)); + DebugGuard.MustBeGreaterThan(rootBits, 0, nameof(rootBits)); + DebugGuard.NotNull(codeLengths, nameof(codeLengths)); + DebugGuard.MustBeGreaterThan(codeLengthsSize, 0, nameof(codeLengthsSize)); // sorted[codeLengthsSize] is a pre-allocated array for sorting symbols by code length. 
int[] sorted = new int[codeLengthsSize]; @@ -467,27 +468,27 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless break; } - else if (repetitions < 11) + + if (repetitions < 11) { tokens[pos].Code = 17; tokens[pos].ExtraBits = (byte)(repetitions - 3); pos++; break; } - else if (repetitions < 139) + + if (repetitions < 139) { tokens[pos].Code = 18; tokens[pos].ExtraBits = (byte)(repetitions - 11); pos++; break; } - else - { - tokens[pos].Code = 18; - tokens[pos].ExtraBits = 0x7f; // 138 repeated 0s - pos++; - repetitions -= 138; - } + + tokens[pos].Code = 18; + tokens[pos].ExtraBits = 0x7f; // 138 repeated 0s + pos++; + repetitions -= 138; } return pos; @@ -519,20 +520,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless break; } - else if (repetitions < 7) + + if (repetitions < 7) { tokens[pos].Code = 16; tokens[pos].ExtraBits = (byte)(repetitions - 3); pos++; break; } - else - { - tokens[pos].Code = 16; - tokens[pos].ExtraBits = 3; - pos++; - repetitions -= 6; - } + + tokens[pos].Code = 16; + tokens[pos].ExtraBits = 3; + pos++; + repetitions -= 6; } return pos; @@ -541,7 +541,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// /// Get the actual bit values for a tree of bit depths. /// - /// The hiffman tree. + /// The huffman tree. private static void ConvertBitDepthsToSymbols(HuffmanTreeCode tree) { // 0 bit-depth means that the symbol does not exist. @@ -628,7 +628,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// private static void ReplicateValue(Span table, int step, int end, HuffmanCode code) { - Guard.IsTrue(end % step == 0, nameof(end), "end must be a multiple of step"); + DebugGuard.IsTrue(end % step == 0, nameof(end), "end must be a multiple of step"); do { @@ -656,6 +656,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// /// Heuristics for selecting the stride ranges to collapse. 
/// + [MethodImpl(InliningOptions.ShortMethod)] private static bool ValuesShouldBeCollapsedToStrideAverage(int a, int b) => Math.Abs(a - b) < 4; } } diff --git a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs index 4f7a4eb3d8..82fd13c7d5 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs @@ -834,10 +834,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private void BuildPackedTable(HTreeGroup hTreeGroup) { - for (uint code = 0; code < HuffmanUtils.HuffmanPackedTableSize; ++code) + for (uint code = 0; code < HuffmanUtils.HuffmanPackedTableSize; code++) { uint bits = code; - HuffmanCode huff = hTreeGroup.PackedTable[bits]; + ref HuffmanCode huff = ref hTreeGroup.PackedTable[bits]; HuffmanCode hCode = hTreeGroup.HTrees[HuffIndex.Green][bits]; if (hCode.Value >= WebpConstants.NumLiteralCodes) { @@ -848,10 +848,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { huff.BitsUsed = 0; huff.Value = 0; - bits >>= AccumulateHCode(hCode, 8, huff); - bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Red][bits], 16, huff); - bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Blue][bits], 0, huff); - bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Alpha][bits], 24, huff); + bits >>= AccumulateHCode(hCode, 8, ref huff); + bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Red][bits], 16, ref huff); + bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Blue][bits], 0, ref huff); + bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Alpha][bits], 24, ref huff); } } } @@ -992,7 +992,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } [MethodImpl(InliningOptions.ShortMethod)] - private static int AccumulateHCode(HuffmanCode hCode, int shift, HuffmanCode huff) + private static int AccumulateHCode(HuffmanCode hCode, int shift, ref HuffmanCode huff) { huff.BitsUsed += hCode.BitsUsed; 
huff.Value |= hCode.Value << shift; diff --git a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs index 5306a8c786..f7eef0d85c 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs @@ -5,7 +5,7 @@ using SixLabors.ImageSharp.Formats.Webp.Lossless; using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; -namespace SixLabors.ImageSharp.Tests.Formats.WebP +namespace SixLabors.ImageSharp.Tests.Formats.Webp { [Trait("Format", "Webp")] public class ColorSpaceTransformUtilsTests diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index 69a24843ca..907b18300c 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -6,7 +6,7 @@ using SixLabors.ImageSharp.Formats.Webp.Lossy; using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; -namespace SixLabors.ImageSharp.Tests.Formats.WebP +namespace SixLabors.ImageSharp.Tests.Formats.Webp { [Trait("Format", "Webp")] public class LossyUtilsTests diff --git a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs index 55738199b7..80b5f0a531 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs @@ -6,7 +6,7 @@ using SixLabors.ImageSharp.Formats.Webp.Lossy; using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; -namespace SixLabors.ImageSharp.Tests.Formats.WebP +namespace SixLabors.ImageSharp.Tests.Formats.Webp { [Trait("Format", "Webp")] public class QuantEncTests diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index 17c9beb9b7..6bcb4f21f4 100644 --- 
a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -6,7 +6,7 @@ using SixLabors.ImageSharp.Formats.Webp.Lossy; using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; -namespace SixLabors.ImageSharp.Tests.Formats.WebP +namespace SixLabors.ImageSharp.Tests.Formats.Webp { [Trait("Format", "Webp")] public class Vp8EncodingTests From 44316b223157d59873753548cedfe9b176fd9e80 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sat, 20 Nov 2021 18:01:45 +0100 Subject: [PATCH 41/69] Make HTreeGroup a struct --- src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs b/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs index 09ceb0334a..6c2217eb6e 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs @@ -13,12 +13,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// - UsePackedTable: few enough literal symbols, so all the bit codes can fit into a small look-up table PackedTable[] /// The common literal base, if applicable, is stored in 'LiteralArb'. 
/// - internal class HTreeGroup + internal struct HTreeGroup { public HTreeGroup(uint packedTableSize) { this.HTrees = new List(WebpConstants.HuffmanCodesPerMetaCode); this.PackedTable = new HuffmanCode[packedTableSize]; + this.IsTrivialCode = false; + this.IsTrivialLiteral = false; + this.LiteralArb = 0; + this.UsePackedTable = false; } /// From c712f98e055caa7a23eedcce1bc8cfe476664453 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sat, 20 Nov 2021 18:17:55 +0100 Subject: [PATCH 42/69] Change CodeLengthCodeOrder, LiteralMap and Norm to ReadOnlySpan --- .../Formats/Webp/Lossless/WebpLosslessDecoder.cs | 13 ++++++------- src/ImageSharp/Formats/Webp/WebpLookupTables.cs | 3 ++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs index 82fd13c7d5..82bd32a020 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs @@ -65,15 +65,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless FixedTableSize + 2704 }; - private static readonly byte[] CodeLengthCodeOrder = { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; - private static readonly int NumCodeLengthCodes = CodeLengthCodeOrder.Length; - private static readonly byte[] LiteralMap = - { - 0, 1, 1, 1, 0 - }; - /// /// Initializes a new instance of the class. /// @@ -87,6 +80,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless this.configuration = configuration; } + // This uses C#'s compiler optimization to refer to assembly's static data directly. + private static ReadOnlySpan CodeLengthCodeOrder => new byte[] { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + + // This uses C#'s compiler optimization to refer to assembly's static data directly. 
+ private static ReadOnlySpan LiteralMap => new byte[] { 0, 1, 1, 1, 0 }; + /// /// Decodes the image from the stream using the bitreader. /// diff --git a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs index bf47b01bca..c894114354 100644 --- a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs +++ b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs @@ -239,7 +239,8 @@ namespace SixLabors.ImageSharp.Formats.Webp } }; - public static readonly byte[] Norm = + // This uses C#'s compiler optimization to refer to assembly's static data directly. + public static ReadOnlySpan Norm => new byte[] { // renorm_sizes[i] = 8 - log2(i) 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, From af7c3c509c6dcc2359ad36e21749a59b20ba0ceb Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 04:38:12 +0300 Subject: [PATCH 43/69] Added generic benchmark --- .../Codecs/Jpeg/DecodeJpeg.cs | 81 +++++++++++++++++++ .../Formats/Jpg/JpegDecoderTests.Metadata.cs | 2 +- tests/ImageSharp.Tests/TestImages.cs | 3 +- .../Jpg/baseline/winter444_interleaved.jpg | 3 + ...inter.jpg => winter420_noninterleaved.jpg} | 0 5 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs create mode 100644 tests/Images/Input/Jpg/baseline/winter444_interleaved.jpg rename tests/Images/Input/Jpg/progressive/{winter.jpg => winter420_noninterleaved.jpg} (100%) diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs new file mode 100644 index 0000000000..7a878738d6 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs @@ -0,0 +1,81 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System.IO; +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Formats.Jpeg; +using SixLabors.ImageSharp.Tests; + +namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg +{ + public class DecodeJpeg + { + private JpegDecoder decoder; + + private MemoryStream preloadedImageStream; + + private void GenericSetup(string imageSubpath) + { + this.decoder = new JpegDecoder(); + byte[] bytes = File.ReadAllBytes(Path.Combine(TestEnvironment.InputImagesDirectoryFullPath, imageSubpath)); + this.preloadedImageStream = new MemoryStream(bytes); + } + + private void GenericBechmark() + { + this.preloadedImageStream.Position = 0; + using Image img = this.decoder.Decode(Configuration.Default, this.preloadedImageStream); + } + + [GlobalSetup(Target = nameof(JpegBaselineInterleaved444))] + public void SetupBaselineInterleaved444() => + this.GenericSetup(TestImages.Jpeg.Baseline.Winter444_Interleaved); + + [GlobalSetup(Target = nameof(JpegBaselineInterleaved420))] + public void SetupBaselineInterleaved420() => + this.GenericSetup(TestImages.Jpeg.Baseline.Hiyamugi); + + [GlobalSetup(Target = nameof(JpegBaseline400))] + public void SetupBaselineSingleComponent() => + this.GenericSetup(TestImages.Jpeg.Baseline.Jpeg400); + + [GlobalSetup(Target = nameof(JpegProgressiveNonInterleaved420))] + public void SetupProgressiveNoninterleaved420() => + this.GenericSetup(TestImages.Jpeg.Progressive.Winter420_NonInterleaved); + + [GlobalCleanup] + public void Cleanup() + { + this.preloadedImageStream.Dispose(); + this.preloadedImageStream = null; + } + + [Benchmark(Description = "Baseline 4:4:4 Interleaved")] + public void JpegBaselineInterleaved444() => this.GenericBechmark(); + + [Benchmark(Description = "Baseline 4:2:0 Interleaved")] + public void JpegBaselineInterleaved420() => this.GenericBechmark(); + + [Benchmark(Description = "Baseline 4:0:0 (grayscale)")] + public void JpegBaseline400() => this.GenericBechmark(); + + [Benchmark(Description = "Progressive 4:2:0 
Non-Interleaved")] + public void JpegProgressiveNonInterleaved420() => this.GenericBechmark(); + } +} + + +/* +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1288 (20H2/October2020Update) +Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + DefaultJob : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT +MASTER +| Method | Mean | Error | StdDev | +|------------------------------------ |----------:|----------:|----------:| +| 'Baseline 4:4:4 Interleaved' | 12.710 ms | 0.1120 ms | 0.0990 ms | +| 'Baseline 4:2:0 Interleaved' | 8.855 ms | 0.1447 ms | 0.1353 ms | +| 'Baseline 4:0:0 (grayscale)' | 1.660 ms | 0.0106 ms | 0.0088 ms | +| 'Progressive 4:2:0 Non-Interleaved' | 14.138 ms | 0.2797 ms | 0.3330 ms | +*/ diff --git a/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Metadata.cs b/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Metadata.cs index 5e42c6c8fc..7b3e20aa2a 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Metadata.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Metadata.cs @@ -56,7 +56,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { TestImages.Jpeg.Progressive.Fb, 75 }, { TestImages.Jpeg.Issues.IncorrectQuality845, 98 }, { TestImages.Jpeg.Baseline.ForestBridgeDifferentComponentsQuality, 89 }, - { TestImages.Jpeg.Progressive.Winter, 80 } + { TestImages.Jpeg.Progressive.Winter420_NonInterleaved, 80 } }; [Theory] diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs index 116c5adc34..e003649135 100644 --- a/tests/ImageSharp.Tests/TestImages.cs +++ b/tests/ImageSharp.Tests/TestImages.cs @@ -163,7 +163,7 @@ namespace SixLabors.ImageSharp.Tests public const string Fb = "Jpg/progressive/fb.jpg"; public const string Progress = "Jpg/progressive/progress.jpg"; public const string Festzug = 
"Jpg/progressive/Festzug.jpg"; - public const string Winter = "Jpg/progressive/winter.jpg"; + public const string Winter420_NonInterleaved = "Jpg/progressive/winter420_noninterleaved.jpg"; public static class Bad { @@ -213,6 +213,7 @@ namespace SixLabors.ImageSharp.Tests public const string ArithmeticCoding = "Jpg/baseline/arithmetic_coding.jpg"; public const string ArithmeticCodingProgressive = "Jpg/progressive/arithmetic_progressive.jpg"; public const string Lossless = "Jpg/baseline/lossless.jpg"; + public const string Winter444_Interleaved = "Jpg/baseline/winter444_interleaved.jpg"; public static readonly string[] All = { diff --git a/tests/Images/Input/Jpg/baseline/winter444_interleaved.jpg b/tests/Images/Input/Jpg/baseline/winter444_interleaved.jpg new file mode 100644 index 0000000000..9ae834389f --- /dev/null +++ b/tests/Images/Input/Jpg/baseline/winter444_interleaved.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73b1deb4e2fb8027f6bb4fb293e5b2615c80b3ac0a7f99fd90118fd340a9fd12 +size 283330 diff --git a/tests/Images/Input/Jpg/progressive/winter.jpg b/tests/Images/Input/Jpg/progressive/winter420_noninterleaved.jpg similarity index 100% rename from tests/Images/Input/Jpg/progressive/winter.jpg rename to tests/Images/Input/Jpg/progressive/winter420_noninterleaved.jpg From cdb2a648a6cdc2061ee399ceae4a96e7e9b59f3b Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 04:47:37 +0300 Subject: [PATCH 44/69] Structured benchmark files --- tests/ImageSharp.Benchmarks/Codecs/{ => Bmp}/DecodeBmp.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Bmp}/EncodeBmp.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Bmp}/EncodeBmpMultiple.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Gif}/DecodeGif.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Gif}/EncodeGif.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Gif}/EncodeGifMultiple.cs | 0 .../Codecs/Jpeg/{ => ColorConversion}/CmykColorConversion.cs | 0 .../Codecs/Jpeg/{ => 
ColorConversion}/ColorConversionBenchmark.cs | 0 .../Codecs/Jpeg/{ => ColorConversion}/GrayscaleColorConversion.cs | 0 .../Codecs/Jpeg/{ => ColorConversion}/RgbColorConversion.cs | 0 .../Codecs/Jpeg/{ => ColorConversion}/YCbCrColorConversion.cs | 0 .../Jpeg/ColorConversion}/YCbCrForwardConverterBenchmark.cs | 0 .../Codecs/Jpeg/{ => ColorConversion}/YccKColorConverter.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Png}/DecodeFilteredPng.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Png}/DecodePng.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Png}/EncodeIndexedPng.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Png}/EncodePng.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Tga}/DecodeTga.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Tga}/EncodeTga.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Tiff}/DecodeTiff.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Tiff}/EncodeTiff.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Webp}/DecodeWebp.cs | 0 tests/ImageSharp.Benchmarks/Codecs/{ => Webp}/EncodeWebp.cs | 0 23 files changed, 0 insertions(+), 0 deletions(-) rename tests/ImageSharp.Benchmarks/Codecs/{ => Bmp}/DecodeBmp.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Bmp}/EncodeBmp.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Bmp}/EncodeBmpMultiple.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Gif}/DecodeGif.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Gif}/EncodeGif.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Gif}/EncodeGifMultiple.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/Jpeg/{ => ColorConversion}/CmykColorConversion.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/Jpeg/{ => ColorConversion}/ColorConversionBenchmark.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/Jpeg/{ => ColorConversion}/GrayscaleColorConversion.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/Jpeg/{ => ColorConversion}/RgbColorConversion.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/Jpeg/{ => 
ColorConversion}/YCbCrColorConversion.cs (100%) rename tests/ImageSharp.Benchmarks/{Format/Jpeg/Components/Encoder => Codecs/Jpeg/ColorConversion}/YCbCrForwardConverterBenchmark.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/Jpeg/{ => ColorConversion}/YccKColorConverter.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Png}/DecodeFilteredPng.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Png}/DecodePng.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Png}/EncodeIndexedPng.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Png}/EncodePng.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Tga}/DecodeTga.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Tga}/EncodeTga.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Tiff}/DecodeTiff.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Tiff}/EncodeTiff.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Webp}/DecodeWebp.cs (100%) rename tests/ImageSharp.Benchmarks/Codecs/{ => Webp}/EncodeWebp.cs (100%) diff --git a/tests/ImageSharp.Benchmarks/Codecs/DecodeBmp.cs b/tests/ImageSharp.Benchmarks/Codecs/Bmp/DecodeBmp.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/DecodeBmp.cs rename to tests/ImageSharp.Benchmarks/Codecs/Bmp/DecodeBmp.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeBmp.cs b/tests/ImageSharp.Benchmarks/Codecs/Bmp/EncodeBmp.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/EncodeBmp.cs rename to tests/ImageSharp.Benchmarks/Codecs/Bmp/EncodeBmp.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeBmpMultiple.cs b/tests/ImageSharp.Benchmarks/Codecs/Bmp/EncodeBmpMultiple.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/EncodeBmpMultiple.cs rename to tests/ImageSharp.Benchmarks/Codecs/Bmp/EncodeBmpMultiple.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/DecodeGif.cs b/tests/ImageSharp.Benchmarks/Codecs/Gif/DecodeGif.cs similarity index 100% rename from 
tests/ImageSharp.Benchmarks/Codecs/DecodeGif.cs rename to tests/ImageSharp.Benchmarks/Codecs/Gif/DecodeGif.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeGif.cs b/tests/ImageSharp.Benchmarks/Codecs/Gif/EncodeGif.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/EncodeGif.cs rename to tests/ImageSharp.Benchmarks/Codecs/Gif/EncodeGif.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeGifMultiple.cs b/tests/ImageSharp.Benchmarks/Codecs/Gif/EncodeGifMultiple.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/EncodeGifMultiple.cs rename to tests/ImageSharp.Benchmarks/Codecs/Gif/EncodeGifMultiple.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/CmykColorConversion.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/CmykColorConversion.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/Jpeg/CmykColorConversion.cs rename to tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/CmykColorConversion.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversionBenchmark.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/ColorConversionBenchmark.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversionBenchmark.cs rename to tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/ColorConversionBenchmark.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/GrayscaleColorConversion.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/GrayscaleColorConversion.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/Jpeg/GrayscaleColorConversion.cs rename to tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/GrayscaleColorConversion.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/RgbColorConversion.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/RgbColorConversion.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/Jpeg/RgbColorConversion.cs rename to 
tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/RgbColorConversion.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/YCbCrColorConversion.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/YCbCrColorConversion.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/Jpeg/YCbCrColorConversion.cs rename to tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/YCbCrColorConversion.cs diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/YCbCrForwardConverterBenchmark.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs rename to tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/YCbCrForwardConverterBenchmark.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/YccKColorConverter.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/YccKColorConverter.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/Jpeg/YccKColorConverter.cs rename to tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/YccKColorConverter.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/DecodeFilteredPng.cs b/tests/ImageSharp.Benchmarks/Codecs/Png/DecodeFilteredPng.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/DecodeFilteredPng.cs rename to tests/ImageSharp.Benchmarks/Codecs/Png/DecodeFilteredPng.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/DecodePng.cs b/tests/ImageSharp.Benchmarks/Codecs/Png/DecodePng.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/DecodePng.cs rename to tests/ImageSharp.Benchmarks/Codecs/Png/DecodePng.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeIndexedPng.cs b/tests/ImageSharp.Benchmarks/Codecs/Png/EncodeIndexedPng.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/EncodeIndexedPng.cs rename to 
tests/ImageSharp.Benchmarks/Codecs/Png/EncodeIndexedPng.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodePng.cs b/tests/ImageSharp.Benchmarks/Codecs/Png/EncodePng.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/EncodePng.cs rename to tests/ImageSharp.Benchmarks/Codecs/Png/EncodePng.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/DecodeTga.cs b/tests/ImageSharp.Benchmarks/Codecs/Tga/DecodeTga.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/DecodeTga.cs rename to tests/ImageSharp.Benchmarks/Codecs/Tga/DecodeTga.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeTga.cs b/tests/ImageSharp.Benchmarks/Codecs/Tga/EncodeTga.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/EncodeTga.cs rename to tests/ImageSharp.Benchmarks/Codecs/Tga/EncodeTga.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/DecodeTiff.cs b/tests/ImageSharp.Benchmarks/Codecs/Tiff/DecodeTiff.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/DecodeTiff.cs rename to tests/ImageSharp.Benchmarks/Codecs/Tiff/DecodeTiff.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeTiff.cs b/tests/ImageSharp.Benchmarks/Codecs/Tiff/EncodeTiff.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/EncodeTiff.cs rename to tests/ImageSharp.Benchmarks/Codecs/Tiff/EncodeTiff.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/DecodeWebp.cs b/tests/ImageSharp.Benchmarks/Codecs/Webp/DecodeWebp.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/DecodeWebp.cs rename to tests/ImageSharp.Benchmarks/Codecs/Webp/DecodeWebp.cs diff --git a/tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs b/tests/ImageSharp.Benchmarks/Codecs/Webp/EncodeWebp.cs similarity index 100% rename from tests/ImageSharp.Benchmarks/Codecs/EncodeWebp.cs rename to tests/ImageSharp.Benchmarks/Codecs/Webp/EncodeWebp.cs From e143093d98c38082d64e7c0e83d0793fd7c50e8a Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 
21 Nov 2021 04:52:40 +0300 Subject: [PATCH 45/69] Removed excess comment --- tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs index 7a878738d6..842eea685a 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs @@ -71,7 +71,7 @@ Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores .NET SDK=6.0.100-preview.3.21202.5 [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT DefaultJob : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT -MASTER + | Method | Mean | Error | StdDev | |------------------------------------ |----------:|----------:|----------:| | 'Baseline 4:4:4 Interleaved' | 12.710 ms | 0.1120 ms | 0.0990 ms | From 582fa51229cc9a8c0a8f600546ea3d9f5acfaa7d Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 05:53:36 +0300 Subject: [PATCH 46/69] Fixed jpeg component mcu size calculation bug --- .../Components/Decoder/HuffmanScanDecoder.cs | 1 + .../Components/Decoder/SpectralConverter.cs | 19 +++++++++++++ .../Decoder/SpectralConverter{TPixel}.cs | 28 ++++++++----------- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs index bc9a53ea04..622657c48f 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs @@ -151,6 +151,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder if (this.componentsCount == this.frame.ComponentCount) { this.ParseBaselineDataInterleaved(); + this.spectralConverter.CommitConvertion(); } else { diff --git 
a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs index e975b11fbb..aca9dc36b3 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs @@ -13,6 +13,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder /// internal abstract class SpectralConverter { + /// + /// Gets a value indicating whether this converter has converted spectral + /// data of the current image or not. + /// + protected bool Converted { get; private set; } + /// /// Injects jpeg image decoding metadata. /// @@ -33,6 +39,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder /// public abstract void ConvertStrideBaseline(); + /// + /// Marks current converter state as 'converted'. + /// + /// + /// This must be called only for baseline interleaved jpeg's. + /// + public void CommitConvertion() + { + DebugGuard.IsFalse(this.Converted, nameof(this.Converted), $"{nameof(this.CommitConvertion)} must be called only once"); + + this.Converted = true; + } + /// /// Gets the color converter. 
/// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs index ec7f3e5c30..2e965e0ac3 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs @@ -3,6 +3,7 @@ using System; using System.Buffers; +using System.Linq; using System.Numerics; using System.Threading; using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters; @@ -29,8 +30,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder private Buffer2D pixelBuffer; - private int blockRowsPerStep; - private int pixelRowsPerStep; private int pixelRowCounter; @@ -41,8 +40,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder this.cancellationToken = cancellationToken; } - private bool Converted => this.pixelRowCounter >= this.pixelBuffer.Height; - public Buffer2D GetPixelBuffer() { if (!this.Converted) @@ -52,7 +49,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder for (int step = 0; step < steps; step++) { this.cancellationToken.ThrowIfCancellationRequested(); - this.ConvertNextStride(step); + this.ConvertStride(step); } } @@ -65,18 +62,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder MemoryAllocator allocator = this.configuration.MemoryAllocator; // iteration data - IJpegComponent c0 = frame.Components[0]; + int majorBlockWidth = frame.Components.Max((component) => component.SizeInBlocks.Width); + int majorVerticalSamplingFactor = frame.Components.Max((component) => component.SamplingFactors.Height); const int blockPixelHeight = 8; - this.blockRowsPerStep = c0.SamplingFactors.Height; - this.pixelRowsPerStep = this.blockRowsPerStep * blockPixelHeight; + this.pixelRowsPerStep = majorVerticalSamplingFactor * blockPixelHeight; // pixel buffer for resulting image this.pixelBuffer = 
allocator.Allocate2D(frame.PixelWidth, frame.PixelHeight); this.paddedProxyPixelRow = allocator.Allocate(frame.PixelWidth + 3); // component processors from spectral to Rgba32 - var postProcessorBufferSize = new Size(c0.SizeInBlocks.Width * 8, this.pixelRowsPerStep); + const int blockPixelWidth = 8; + var postProcessorBufferSize = new Size(majorBlockWidth * blockPixelWidth, this.pixelRowsPerStep); this.componentProcessors = new JpegComponentPostProcessor[frame.Components.Length]; for (int i = 0; i < this.componentProcessors.Length; i++) { @@ -84,7 +82,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder } // single 'stride' rgba32 buffer for conversion between spectral and TPixel - // this.rgbaBuffer = allocator.Allocate(frame.PixelWidth); this.rgbBuffer = allocator.Allocate(frame.PixelWidth * 3); // color converter from Rgba32 to TPixel @@ -95,18 +92,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder public override void ConvertStrideBaseline() { // Convert next pixel stride using single spectral `stride' - // Note that zero passing eliminates the need of virtual call from JpegComponentPostProcessor - this.ConvertNextStride(spectralStep: 0); + // Note that zero passing eliminates the need of virtual call + // from JpegComponentPostProcessor + this.ConvertStride(spectralStep: 0); - // Clear spectral stride - this is VERY important as jpeg possibly won't fill entire buffer each stride - // Which leads to decoding artifacts - // Note that this code clears all buffers of the post processors, it's their responsibility to allocate only single stride foreach (JpegComponentPostProcessor cpp in this.componentProcessors) { cpp.ClearSpectralBuffers(); } } + /// public void Dispose() { if (this.componentProcessors != null) @@ -121,7 +117,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder this.paddedProxyPixelRow?.Dispose(); } - private void ConvertNextStride(int spectralStep) + private void ConvertStride(int spectralStep) { 
int maxY = Math.Min(this.pixelBuffer.Height, this.pixelRowCounter + this.pixelRowsPerStep); From e1c0a39c9d2c8ef712af09f7694068efd0d323bd Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 06:25:31 +0300 Subject: [PATCH 47/69] Added bug proof image to jpeg decoder test suit --- tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Images.cs | 2 ++ .../JpegDecoderTests/DecodeBaselineJpeg_jpeg422.png | 3 +++ 2 files changed, 5 insertions(+) create mode 100644 tests/Images/External/ReferenceOutput/JpegDecoderTests/DecodeBaselineJpeg_jpeg422.png diff --git a/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Images.cs b/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Images.cs index d12240cba3..ef817154d6 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Images.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Images.cs @@ -20,6 +20,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg TestImages.Jpeg.Baseline.Jpeg420Small, TestImages.Jpeg.Issues.Fuzz.AccessViolationException922, TestImages.Jpeg.Baseline.Jpeg444, + TestImages.Jpeg.Baseline.Jpeg422, TestImages.Jpeg.Baseline.Bad.BadEOF, TestImages.Jpeg.Baseline.MultiScanBaselineCMYK, TestImages.Jpeg.Baseline.YcckSubsample1222, @@ -100,6 +101,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg [TestImages.Jpeg.Baseline.Bad.BadEOF] = 0.38f / 100, [TestImages.Jpeg.Baseline.Bad.BadRST] = 0.0589f / 100, + [TestImages.Jpeg.Baseline.Jpeg422] = 0.0013f / 100, [TestImages.Jpeg.Baseline.Testorig420] = 0.38f / 100, [TestImages.Jpeg.Baseline.Jpeg420Small] = 0.287f / 100, [TestImages.Jpeg.Baseline.Turtle420] = 1.0f / 100, diff --git a/tests/Images/External/ReferenceOutput/JpegDecoderTests/DecodeBaselineJpeg_jpeg422.png b/tests/Images/External/ReferenceOutput/JpegDecoderTests/DecodeBaselineJpeg_jpeg422.png new file mode 100644 index 0000000000..018ecda7a5 --- /dev/null +++ b/tests/Images/External/ReferenceOutput/JpegDecoderTests/DecodeBaselineJpeg_jpeg422.png @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:733cc46271c4402974db2536a55e6ecae3110856df73031ca48dad03745d852d +size 35375 From 5e40977eb033fbc3296561f5ebf30cef2bff7467 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 06:36:57 +0300 Subject: [PATCH 48/69] Fixed typo --- .../Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs | 2 +- .../Formats/Jpeg/Components/Decoder/SpectralConverter.cs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs index 622657c48f..ad09b50655 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs @@ -151,7 +151,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder if (this.componentsCount == this.frame.ComponentCount) { this.ParseBaselineDataInterleaved(); - this.spectralConverter.CommitConvertion(); + this.spectralConverter.CommitConversion(); } else { diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs index aca9dc36b3..4e74f62269 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs @@ -45,9 +45,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder /// /// This must be called only for baseline interleaved jpeg's. 
/// - public void CommitConvertion() + public void CommitConversion() { - DebugGuard.IsFalse(this.Converted, nameof(this.Converted), $"{nameof(this.CommitConvertion)} must be called only once"); + DebugGuard.IsFalse(this.Converted, nameof(this.Converted), $"{nameof(this.CommitConversion)} must be called only once"); this.Converted = true; } From 2a182d75637702ae8c633545dba1b7fe032bd8b2 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 07:07:03 +0300 Subject: [PATCH 49/69] Updated benchmark results --- tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs index 842eea685a..cb89f90829 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs @@ -74,8 +74,8 @@ Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores | Method | Mean | Error | StdDev | |------------------------------------ |----------:|----------:|----------:| -| 'Baseline 4:4:4 Interleaved' | 12.710 ms | 0.1120 ms | 0.0990 ms | -| 'Baseline 4:2:0 Interleaved' | 8.855 ms | 0.1447 ms | 0.1353 ms | -| 'Baseline 4:0:0 (grayscale)' | 1.660 ms | 0.0106 ms | 0.0088 ms | -| 'Progressive 4:2:0 Non-Interleaved' | 14.138 ms | 0.2797 ms | 0.3330 ms | +| 'Baseline 4:4:4 Interleaved' | 11.781 ms | 0.0737 ms | 0.0654 ms | +| 'Baseline 4:2:0 Interleaved' | 8.688 ms | 0.0345 ms | 0.0306 ms | +| 'Baseline 4:0:0 (grayscale)' | 1.643 ms | 0.0092 ms | 0.0086 ms | +| 'Progressive 4:2:0 Non-Interleaved' | 13.770 ms | 0.0928 ms | 0.0823 ms | */ From 408462a4ac358abd10ce8d1f2d542bd811915992 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 09:09:18 +0300 Subject: [PATCH 50/69] Added new IDCT implementation --- .../FastFloatingPointDCT.Intrinsic.cs | 237 ++++---- .../Jpeg/Components/FastFloatingPointDCT.cs | 525 
++++++++---------- 2 files changed, 346 insertions(+), 416 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index ab9462632f..94864005ec 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -2,9 +2,6 @@ // Licensed under the Apache License, Version 2.0. #if SUPPORTS_RUNTIME_INTRINSICS -using System.Diagnostics; -using System.Numerics; -using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -12,149 +9,147 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { internal static partial class FastFloatingPointDCT { -#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings +#pragma warning disable SA1310, SA1311, IDE1006 // naming rule violation warnings private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f); private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); - private static readonly Vector256 mm256_F_1_1758 = Vector256.Create(1.175876f); - private static readonly Vector256 mm256_F_n1_9615 = Vector256.Create(-1.961570560f); - private static readonly Vector256 mm256_F_n0_3901 = Vector256.Create(-0.390180644f); - private static readonly Vector256 mm256_F_n0_8999 = Vector256.Create(-0.899976223f); - private static readonly Vector256 mm256_F_n2_5629 = Vector256.Create(-2.562915447f); - private static readonly Vector256 mm256_F_0_2986 = Vector256.Create(0.298631336f); - private static readonly Vector256 mm256_F_2_0531 = Vector256.Create(2.053119869f); - private static readonly Vector256 mm256_F_3_0727 = 
Vector256.Create(3.072711026f); - private static readonly Vector256 mm256_F_1_5013 = Vector256.Create(1.501321110f); - private static readonly Vector256 mm256_F_n1_8477 = Vector256.Create(-1.847759065f); - private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f); + private static readonly Vector256 mm256_F_1_4142 = Vector256.Create(1.414213562f); + private static readonly Vector256 mm256_F_1_8477 = Vector256.Create(1.847759065f); + private static readonly Vector256 mm256_F_n1_0823 = Vector256.Create(-1.082392200f); + private static readonly Vector256 mm256_F_n2_6131 = Vector256.Create(-2.613125930f); #pragma warning restore SA1310, SA1311, IDE1006 /// /// Apply floating point FDCT inplace using simd operations. /// - /// Input matrix. - private static void ForwardTransform_Avx(ref Block8x8F block) + /// Input block. + private static void FDCT8x8_Avx(ref Block8x8F block) { DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); // First pass - process rows block.TransposeInplace(); - FDCT8x8_Avx(ref block); + FDCT8x8_1D_Avx(ref block); // Second pass - process columns block.TransposeInplace(); - FDCT8x8_Avx(ref block); + FDCT8x8_1D_Avx(ref block); + + // Applies 1D floating point FDCT inplace + static void FDCT8x8_1D_Avx(ref Block8x8F block) + { + Vector256 tmp0 = Avx.Add(block.V0, block.V7); + Vector256 tmp7 = Avx.Subtract(block.V0, block.V7); + Vector256 tmp1 = Avx.Add(block.V1, block.V6); + Vector256 tmp6 = Avx.Subtract(block.V1, block.V6); + Vector256 tmp2 = Avx.Add(block.V2, block.V5); + Vector256 tmp5 = Avx.Subtract(block.V2, block.V5); + Vector256 tmp3 = Avx.Add(block.V3, block.V4); + Vector256 tmp4 = Avx.Subtract(block.V3, block.V4); + + // Even part + Vector256 tmp10 = Avx.Add(tmp0, tmp3); + Vector256 tmp13 = Avx.Subtract(tmp0, tmp3); + Vector256 tmp11 = Avx.Add(tmp1, tmp2); + Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); + + block.V0 = Avx.Add(tmp10, tmp11); + block.V4 = Avx.Subtract(tmp10, tmp11); + 
+ Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); + block.V2 = Avx.Add(tmp13, z1); + block.V6 = Avx.Subtract(tmp13, z1); + + // Odd part + tmp10 = Avx.Add(tmp4, tmp5); + tmp11 = Avx.Add(tmp5, tmp6); + tmp12 = Avx.Add(tmp6, tmp7); + + Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826); + Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10); + Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12); + Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); + + Vector256 z11 = Avx.Add(tmp7, z3); + Vector256 z13 = Avx.Subtract(tmp7, z3); + + block.V5 = Avx.Add(z13, z2); + block.V3 = Avx.Subtract(z13, z2); + block.V1 = Avx.Add(z11, z4); + block.V7 = Avx.Subtract(z11, z4); + } } /// - /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix. + /// Apply floating point IDCT inplace using simd operations. /// - /// - /// Requires Avx support. - /// - /// Input matrix. - public static void FDCT8x8_Avx(ref Block8x8F block) + /// Transposed input block. 
+ private static void IDCT8x8_Avx(ref Block8x8F transposedBlock) { DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); - Vector256 tmp0 = Avx.Add(block.V0, block.V7); - Vector256 tmp7 = Avx.Subtract(block.V0, block.V7); - Vector256 tmp1 = Avx.Add(block.V1, block.V6); - Vector256 tmp6 = Avx.Subtract(block.V1, block.V6); - Vector256 tmp2 = Avx.Add(block.V2, block.V5); - Vector256 tmp5 = Avx.Subtract(block.V2, block.V5); - Vector256 tmp3 = Avx.Add(block.V3, block.V4); - Vector256 tmp4 = Avx.Subtract(block.V3, block.V4); - - // Even part - Vector256 tmp10 = Avx.Add(tmp0, tmp3); - Vector256 tmp13 = Avx.Subtract(tmp0, tmp3); - Vector256 tmp11 = Avx.Add(tmp1, tmp2); - Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); - - block.V0 = Avx.Add(tmp10, tmp11); - block.V4 = Avx.Subtract(tmp10, tmp11); - - Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); - block.V2 = Avx.Add(tmp13, z1); - block.V6 = Avx.Subtract(tmp13, z1); - - // Odd part - tmp10 = Avx.Add(tmp4, tmp5); - tmp11 = Avx.Add(tmp5, tmp6); - tmp12 = Avx.Add(tmp6, tmp7); - - Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826); - Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10); - Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12); - Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); - - Vector256 z11 = Avx.Add(tmp7, z3); - Vector256 z13 = Avx.Subtract(tmp7, z3); - - block.V5 = Avx.Add(z13, z2); - block.V3 = Avx.Subtract(z13, z2); - block.V1 = Avx.Add(z11, z4); - block.V7 = Avx.Subtract(z11, z4); - } - - /// - /// Combined operation of and - /// using AVX commands. 
- /// - /// Source - /// Destination - public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) - { - Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); - - Vector256 my1 = s.V1; - Vector256 my7 = s.V7; - Vector256 mz0 = Avx.Add(my1, my7); - - Vector256 my3 = s.V3; - Vector256 mz2 = Avx.Add(my3, my7); - Vector256 my5 = s.V5; - Vector256 mz1 = Avx.Add(my3, my5); - Vector256 mz3 = Avx.Add(my1, my5); - - Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758); - - mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615); - mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901); - mz0 = Avx.Multiply(mz0, mm256_F_n0_8999); - mz1 = Avx.Multiply(mz1, mm256_F_n2_5629); - - Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2); - Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3); - Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2); - Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3); - - Vector256 my2 = s.V2; - Vector256 my6 = s.V6; - mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411); - Vector256 my0 = s.V0; - Vector256 my4 = s.V4; - mz0 = Avx.Add(my0, my4); - mz1 = Avx.Subtract(my0, my4); - mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477); - mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653); - - my0 = Avx.Add(mz0, mz3); - my3 = Avx.Subtract(mz0, mz3); - my1 = Avx.Add(mz1, mz2); - my2 = Avx.Subtract(mz1, mz2); - - d.V0 = Avx.Add(my0, mb0); - d.V7 = Avx.Subtract(my0, mb0); - d.V1 = Avx.Add(my1, mb1); - d.V6 = Avx.Subtract(my1, mb1); - d.V2 = Avx.Add(my2, mb2); - d.V5 = Avx.Subtract(my2, mb2); - d.V3 = Avx.Add(my3, mb3); - d.V4 = Avx.Subtract(my3, mb3); + // First pass - process columns + IDCT8x8_1D_Avx(ref transposedBlock); + + // Second pass - process rows + transposedBlock.TransposeInplace(); + IDCT8x8_1D_Avx(ref 
transposedBlock); + + // Applies 1D floating point FDCT inplace + static void IDCT8x8_1D_Avx(ref Block8x8F block) + { + // Even part + Vector256 tmp0 = block.V0; + Vector256 tmp1 = block.V2; + Vector256 tmp2 = block.V4; + Vector256 tmp3 = block.V6; + + Vector256 z5 = tmp0; + Vector256 tmp10 = Avx.Add(z5, tmp2); + Vector256 tmp11 = Avx.Subtract(z5, tmp2); + + Vector256 tmp13 = Avx.Add(tmp1, tmp3); + Vector256 tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142); + + tmp0 = Avx.Add(tmp10, tmp13); + tmp3 = Avx.Subtract(tmp10, tmp13); + tmp1 = Avx.Add(tmp11, tmp12); + tmp2 = Avx.Subtract(tmp11, tmp12); + + // Odd part + Vector256 tmp4 = block.V1; + Vector256 tmp5 = block.V3; + Vector256 tmp6 = block.V5; + Vector256 tmp7 = block.V7; + + Vector256 z13 = Avx.Add(tmp6, tmp5); + Vector256 z10 = Avx.Subtract(tmp6, tmp5); + Vector256 z11 = Avx.Add(tmp4, tmp7); + Vector256 z12 = Avx.Subtract(tmp4, tmp7); + + tmp7 = Avx.Add(z11, z13); + tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142); + + z5 = Avx.Multiply(Avx.Add(z10, z12), mm256_F_1_8477); + + tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, mm256_F_n1_0823); + tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, mm256_F_n2_6131); + + tmp6 = Avx.Subtract(tmp12, tmp7); + tmp5 = Avx.Subtract(tmp11, tmp6); + tmp4 = Avx.Subtract(tmp10, tmp5); + + block.V0 = Avx.Add(tmp0, tmp7); + block.V7 = Avx.Subtract(tmp0, tmp7); + block.V1 = Avx.Add(tmp1, tmp6); + block.V6 = Avx.Subtract(tmp1, tmp6); + block.V2 = Avx.Add(tmp2, tmp5); + block.V5 = Avx.Subtract(tmp2, tmp5); + block.V3 = Avx.Add(tmp3, tmp4); + block.V4 = Avx.Subtract(tmp3, tmp4); + } } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 6963c36369..a1c03e65c0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -15,102 +15,196 @@ 
namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// internal static partial class FastFloatingPointDCT { -#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore - private const float C_1_175876 = 1.175875602f; - private const float C_1_961571 = -1.961570560f; - private const float C_0_390181 = -0.390180644f; - private const float C_0_899976 = -0.899976223f; - private const float C_2_562915 = -2.562915447f; - private const float C_0_298631 = 0.298631336f; - private const float C_2_053120 = 2.053119869f; - private const float C_3_072711 = 3.072711026f; - private const float C_1_501321 = 1.501321110f; - private const float C_0_541196 = 0.541196100f; - private const float C_1_847759 = -1.847759065f; - private const float C_0_765367 = 0.765366865f; - - private const float C_0_125 = 0.1250f; - -#pragma warning disable SA1311, IDE1006 // naming rules violation warnings - private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f); - private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f); - private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f); - private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f); -#pragma warning restore SA1311, IDE1006 - -#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore +#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings + private static readonly Vector4 mm128_F_0_7071 = new(0.707106781f); + private static readonly Vector4 mm128_F_0_3826 = new(0.382683433f); + private static readonly Vector4 mm128_F_0_5411 = new(0.541196100f); + private static readonly Vector4 mm128_F_1_3065 = new(1.306562965f); + + private static readonly Vector4 mm128_F_1_4142 = new(1.414213562f); + private static readonly Vector4 mm128_F_1_8477 = new(1.847759065f); + private static readonly Vector4 mm128_F_n1_0823 = new(-1.082392200f); + private static readonly Vector4 mm128_F_n2_6131 = new(-2.613125930f); +#pragma warning restore 
SA1310, SA1311, IDE1006 /// - /// Gets reciprocal coefficients for jpeg quantization tables calculation. + /// Gets adjustment table for quantization tables. /// /// /// - /// Current FDCT implementation expects its results to be multiplied by - /// a reciprocal quantization table. To get 8x8 reciprocal block values in this - /// table must be divided by quantization table values scaled with quality settings. + /// Current IDCT and FDCT implementations are based on Arai, Agui, + /// and Nakajima's algorithm. Both DCT methods does not + /// produce finished DCT output, final step is fused into the + /// quantization step. Quantization and de-quantization coefficients + /// must be multiplied by these values. /// /// - /// These values were calculates with this formula: - /// - /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8; - /// - /// Where: + /// Given values were generated by formula: /// + /// scalefactor[row] * scalefactor[col], where /// scalefactor[0] = 1 - /// - /// /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 /// - /// Values are also scaled by 8 so DCT code won't do extra division/multiplication. 
/// /// - internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[] + private static readonly float[] AdjustmentCoefficients = new float[] { - 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, - 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, - 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, - 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, - 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, - 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, - 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, - 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, + 1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f, + 1.3870399f, 1.9238797f, 1.812255f, 1.6309863f, 1.3870399f, 1.0897902f, 0.7506606f, 0.38268346f, + 1.306563f, 1.812255f, 1.707107f, 1.5363555f, 1.306563f, 1.02656f, 0.7071068f, 0.36047992f, + 1.1758755f, 1.6309863f, 1.5363555f, 1.3826833f, 1.1758755f, 0.9238795f, 0.63637924f, 0.32442334f, + 1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f, + 0.78569496f, 1.0897902f, 1.02656f, 0.9238795f, 0.78569496f, 0.61731654f, 0.42521507f, 0.21677275f, + 0.5411961f, 0.7506606f, 0.7071068f, 0.63637924f, 0.5411961f, 0.42521507f, 0.29289323f, 0.14931567f, + 0.27589938f, 0.38268346f, 0.36047992f, 0.32442334f, 0.27589938f, 0.21677275f, 0.14931567f, 0.076120466f, }; /// - /// Adjusts given quantization table to be complient with FDCT implementation. + /// Adjusts given quantization table for usage with . + /// + /// Quantization table to adjust. 
+ public static void AdjustToIDCT(ref Block8x8F quantTable) + { + for (int i = 0; i < Block8x8F.Size; i++) + { + quantTable[i] = quantTable[i] * AdjustmentCoefficients[i] * 0.125f; + } + + // Spectral macroblocks are transposed before quantization + // so we must transpose quantization table + quantTable.TransposeInplace(); + } + + /// + /// Adjusts given quantization table for usage with . + /// + /// Quantization table to adjust. + public static void AdjustToFDCT(ref Block8x8F quantTable) + { + for (int i = 0; i < Block8x8F.Size; i++) + { + quantTable[i] = 0.125f / (quantTable[i] * AdjustmentCoefficients[i]); + } + } + + /// + /// Apply 2D floating point IDCT inplace. /// /// - /// See docs for explanation. + /// Input block must be dequantized before this method with table + /// adjusted by . /// - /// Quantization table to adjust. - public static void AdjustToFDCT(ref Block8x8F quantizationtable) + /// Input block. + public static void TransformIDCT(ref Block8x8F block) { - for (int i = 0; i < Block8x8F.Size; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported) + { + IDCT8x8_Avx(ref block); + } + else +#endif { - quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i]; + IDCT_Vector4(ref block); } } /// - /// Apply 2D floating point FDCT inplace. + /// Apply 2D floating point IDCT inplace. /// - /// Input matrix. + /// + /// Input block must be quantized after this method with table adjusted + /// by . + /// + /// Input block. public static void TransformFDCT(ref Block8x8F block) { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) { - ForwardTransform_Avx(ref block); + FDCT8x8_Avx(ref block); } else #endif if (Vector.IsHardwareAccelerated) { - ForwardTransform_Vector4(ref block); + FDCT_Vector4(ref block); } else { - ForwardTransform_Scalar(ref block); + FDCT_Scalar(ref block); + } + } + + /// + /// Apply floating point IDCT inplace using API. + /// + /// Input block. 
+ private static void IDCT_Vector4(ref Block8x8F transposedBlock) + { + DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); + + // First pass - process columns + IDCT8x4_Vector4(ref transposedBlock.V0L); + IDCT8x4_Vector4(ref transposedBlock.V0R); + + // Second pass - process rows + transposedBlock.TransposeInplace(); + IDCT8x4_Vector4(ref transposedBlock.V0L); + IDCT8x4_Vector4(ref transposedBlock.V0R); + + // Applies 1D floating point IDCT inplace on 8x4 part of 8x8 block + static void IDCT8x4_Vector4(ref Vector4 vecRef) + { + // Even part + Vector4 tmp0 = Unsafe.Add(ref vecRef, 0 * 2); + Vector4 tmp1 = Unsafe.Add(ref vecRef, 2 * 2); + Vector4 tmp2 = Unsafe.Add(ref vecRef, 4 * 2); + Vector4 tmp3 = Unsafe.Add(ref vecRef, 6 * 2); + + Vector4 z5 = tmp0; + Vector4 tmp10 = z5 + tmp2; + Vector4 tmp11 = z5 - tmp2; + + Vector4 tmp13 = tmp1 + tmp3; + Vector4 tmp12 = ((tmp1 - tmp3) * mm128_F_1_4142) - tmp13; + + tmp0 = tmp10 + tmp13; + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + // Odd part + Vector4 tmp4 = Unsafe.Add(ref vecRef, 1 * 2); + Vector4 tmp5 = Unsafe.Add(ref vecRef, 3 * 2); + Vector4 tmp6 = Unsafe.Add(ref vecRef, 5 * 2); + Vector4 tmp7 = Unsafe.Add(ref vecRef, 7 * 2); + + Vector4 z13 = tmp6 + tmp5; + Vector4 z10 = tmp6 - tmp5; + Vector4 z11 = tmp4 + tmp7; + Vector4 z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; + tmp11 = (z11 - z13) * mm128_F_1_4142; + + z5 = (z10 + z12) * mm128_F_1_8477; + + tmp10 = (z12 * mm128_F_n1_0823) + z5; + tmp12 = (z10 * mm128_F_n2_6131) + z5; + + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 - tmp5; + + Unsafe.Add(ref vecRef, 0 * 2) = tmp0 + tmp7; + Unsafe.Add(ref vecRef, 7 * 2) = tmp0 - tmp7; + Unsafe.Add(ref vecRef, 1 * 2) = tmp1 + tmp6; + Unsafe.Add(ref vecRef, 6 * 2) = tmp1 - tmp6; + Unsafe.Add(ref vecRef, 2 * 2) = tmp2 + tmp5; + Unsafe.Add(ref vecRef, 5 * 2) = tmp2 - tmp5; + Unsafe.Add(ref vecRef, 3 * 2) = tmp3 + tmp4; + 
Unsafe.Add(ref vecRef, 4 * 2) = tmp3 - tmp4; } } @@ -120,8 +214,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c. /// - /// Input matrix. - private static void ForwardTransform_Scalar(ref Block8x8F block) + /// Input block. + private static void FDCT_Scalar(ref Block8x8F block) { const int dctSize = 8; @@ -130,17 +224,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components float z1, z2, z3, z4, z5, z11, z13; // First pass - process rows - ref float dataRef = ref Unsafe.As(ref block); + ref float blockRef = ref Unsafe.As(ref block); for (int ctr = 7; ctr >= 0; ctr--) { - tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7); - tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7); - tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6); - tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6); - tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5); - tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5); - tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4); - tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4); + tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 7); + tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 7); + tmp1 = Unsafe.Add(ref blockRef, 1) + Unsafe.Add(ref blockRef, 6); + tmp6 = Unsafe.Add(ref blockRef, 1) - Unsafe.Add(ref blockRef, 6); + tmp2 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 5); + tmp5 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 5); + tmp3 = Unsafe.Add(ref blockRef, 3) + Unsafe.Add(ref blockRef, 4); + tmp4 = Unsafe.Add(ref blockRef, 3) - Unsafe.Add(ref blockRef, 4); // Even part tmp10 = tmp0 + tmp3; @@ -148,12 +242,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11; - Unsafe.Add(ref dataRef, 4) = 
tmp10 - tmp11; + Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11; + Unsafe.Add(ref blockRef, 4) = tmp10 - tmp11; z1 = (tmp12 + tmp13) * 0.707106781f; - Unsafe.Add(ref dataRef, 2) = tmp13 + z1; - Unsafe.Add(ref dataRef, 6) = tmp13 - z1; + Unsafe.Add(ref blockRef, 2) = tmp13 + z1; + Unsafe.Add(ref blockRef, 6) = tmp13 - z1; // Odd part tmp10 = tmp4 + tmp5; @@ -168,26 +262,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components z11 = tmp7 + z3; z13 = tmp7 - z3; - Unsafe.Add(ref dataRef, 5) = z13 + z2; - Unsafe.Add(ref dataRef, 3) = z13 - z2; - Unsafe.Add(ref dataRef, 1) = z11 + z4; - Unsafe.Add(ref dataRef, 7) = z11 - z4; + Unsafe.Add(ref blockRef, 5) = z13 + z2; + Unsafe.Add(ref blockRef, 3) = z13 - z2; + Unsafe.Add(ref blockRef, 1) = z11 + z4; + Unsafe.Add(ref blockRef, 7) = z11 - z4; - dataRef = ref Unsafe.Add(ref dataRef, dctSize); + blockRef = ref Unsafe.Add(ref blockRef, dctSize); } // Second pass - process columns - dataRef = ref Unsafe.As(ref block); + blockRef = ref Unsafe.As(ref block); for (int ctr = 7; ctr >= 0; ctr--) { - tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7); - tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7); - tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6); - tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6); - tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5); - tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5); - tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4); - tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4); + tmp0 = Unsafe.Add(ref blockRef, dctSize * 0) + Unsafe.Add(ref blockRef, dctSize * 7); + tmp7 = Unsafe.Add(ref blockRef, dctSize * 0) - Unsafe.Add(ref blockRef, dctSize * 7); + tmp1 = Unsafe.Add(ref blockRef, dctSize * 1) + Unsafe.Add(ref blockRef, dctSize * 6); 
+ tmp6 = Unsafe.Add(ref blockRef, dctSize * 1) - Unsafe.Add(ref blockRef, dctSize * 6); + tmp2 = Unsafe.Add(ref blockRef, dctSize * 2) + Unsafe.Add(ref blockRef, dctSize * 5); + tmp5 = Unsafe.Add(ref blockRef, dctSize * 2) - Unsafe.Add(ref blockRef, dctSize * 5); + tmp3 = Unsafe.Add(ref blockRef, dctSize * 3) + Unsafe.Add(ref blockRef, dctSize * 4); + tmp4 = Unsafe.Add(ref blockRef, dctSize * 3) - Unsafe.Add(ref blockRef, dctSize * 4); // Even part tmp10 = tmp0 + tmp3; @@ -195,12 +289,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11; - Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11; + Unsafe.Add(ref blockRef, dctSize * 0) = tmp10 + tmp11; + Unsafe.Add(ref blockRef, dctSize * 4) = tmp10 - tmp11; z1 = (tmp12 + tmp13) * 0.707106781f; - Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1; - Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1; + Unsafe.Add(ref blockRef, dctSize * 2) = tmp13 + z1; + Unsafe.Add(ref blockRef, dctSize * 6) = tmp13 - z1; // Odd part tmp10 = tmp4 + tmp5; @@ -215,12 +309,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components z11 = tmp7 + z3; z13 = tmp7 - z3; - Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2; - Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2; - Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4; - Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4; + Unsafe.Add(ref blockRef, dctSize * 5) = z13 + z2; + Unsafe.Add(ref blockRef, dctSize * 3) = z13 - z2; + Unsafe.Add(ref blockRef, dctSize * 1) = z11 + z4; + Unsafe.Add(ref blockRef, dctSize * 7) = z11 - z4; - dataRef = ref Unsafe.Add(ref dataRef, 1); + blockRef = ref Unsafe.Add(ref blockRef, 1); } } @@ -230,11 +324,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// This implementation must be called only if hardware supports 4 /// floating point numbers vector. 
Otherwise explicit scalar - /// implementation is faster - /// because it does not rely on matrix transposition. + /// implementation is faster + /// because it does not rely on block transposition. /// - /// Input matrix. - private static void ForwardTransform_Vector4(ref Block8x8F block) + /// Input block. + public static void FDCT_Vector4(ref Block8x8F block) { DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); @@ -247,209 +341,50 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components block.TransposeInplace(); FDCT8x4_Vector4(ref block.V0L); FDCT8x4_Vector4(ref block.V0R); - } - /// - /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix. - /// - /// - /// Implemented using Vector4 API operations for either scalar or sse hardware implementation. - /// Must be called on both 8x4 matrix parts for the full FDCT transform. - /// - /// Input reference to the first - private static void FDCT8x4_Vector4(ref Vector4 blockRef) - { - Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14); - Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14); - Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12); - Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12); - Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10); - Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10); - Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8); - Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8); - - // Even part - Vector4 tmp10 = tmp0 + tmp3; - Vector4 tmp13 = tmp0 - tmp3; - Vector4 tmp11 = tmp1 + tmp2; - Vector4 tmp12 = tmp1 - tmp2; - - Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11; - Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11; - - Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071; - Unsafe.Add(ref blockRef, 4) = tmp13 + z1; - Unsafe.Add(ref blockRef, 
12) = tmp13 - z1; - - // Odd part - tmp10 = tmp4 + tmp5; - tmp11 = tmp5 + tmp6; - tmp12 = tmp6 + tmp7; - - Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826; - Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5; - Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5; - Vector4 z3 = tmp11 * mm128_F_0_7071; - - Vector4 z11 = tmp7 + z3; - Vector4 z13 = tmp7 - z3; - - Unsafe.Add(ref blockRef, 10) = z13 + z2; - Unsafe.Add(ref blockRef, 6) = z13 - z2; - Unsafe.Add(ref blockRef, 2) = z11 + z4; - Unsafe.Add(ref blockRef, 14) = z11 - z4; - } + // Applies 1D floating point FDCT inplace on 8x4 part of 8x8 block + static void FDCT8x4_Vector4(ref Vector4 vecRef) + { + Vector4 tmp0 = Unsafe.Add(ref vecRef, 0) + Unsafe.Add(ref vecRef, 14); + Vector4 tmp7 = Unsafe.Add(ref vecRef, 0) - Unsafe.Add(ref vecRef, 14); + Vector4 tmp1 = Unsafe.Add(ref vecRef, 2) + Unsafe.Add(ref vecRef, 12); + Vector4 tmp6 = Unsafe.Add(ref vecRef, 2) - Unsafe.Add(ref vecRef, 12); + Vector4 tmp2 = Unsafe.Add(ref vecRef, 4) + Unsafe.Add(ref vecRef, 10); + Vector4 tmp5 = Unsafe.Add(ref vecRef, 4) - Unsafe.Add(ref vecRef, 10); + Vector4 tmp3 = Unsafe.Add(ref vecRef, 6) + Unsafe.Add(ref vecRef, 8); + Vector4 tmp4 = Unsafe.Add(ref vecRef, 6) - Unsafe.Add(ref vecRef, 8); - /// - /// Apply floating point IDCT inplace. - /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. - /// - /// Input matrix. - /// Matrix to store temporal results. 
- public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) - { - block.TransposeInplace(); - IDCT8x8(ref block, ref temp); - temp.TransposeInplace(); - IDCT8x8(ref temp, ref block); + // Even part + Vector4 tmp10 = tmp0 + tmp3; + Vector4 tmp13 = tmp0 - tmp3; + Vector4 tmp11 = tmp1 + tmp2; + Vector4 tmp12 = tmp1 - tmp2; - // TODO: This can be fused into quantization table step - block.MultiplyInPlace(C_0_125); - } + Unsafe.Add(ref vecRef, 0) = tmp10 + tmp11; + Unsafe.Add(ref vecRef, 8) = tmp10 - tmp11; - /// - /// Performs 8x8 matrix Inverse Discrete Cosine Transform - /// - /// Source - /// Destination - private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) - { - IDCT8x8_Avx(ref s, ref d); - } - else -#endif - { - IDCT8x4_LeftPart(ref s, ref d); - IDCT8x4_RightPart(ref s, ref d); - } - } + Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071; + Unsafe.Add(ref vecRef, 4) = tmp13 + z1; + Unsafe.Add(ref vecRef, 12) = tmp13 - z1; - /// - /// Do IDCT internal operations on the left part of the block. 
Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// Destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1L; - Vector4 my7 = s.V7L; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3L; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5L; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2L; - Vector4 my6 = s.V6L; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0L; - Vector4 my4 = s.V4L; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0L = my0 + mb0; - d.V7L = my0 - mb0; - d.V1L = my1 + mb1; - d.V6L = my1 - mb1; - d.V2L = my2 + mb2; - d.V5L = my2 - mb2; - d.V3L = my3 + mb3; - d.V4L = my3 - mb3; - } + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; - /// - /// Do IDCT internal operations on the right part of the block. 
- /// Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// The destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1R; - Vector4 my7 = s.V7R; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3R; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5R; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2R; - Vector4 my6 = s.V6R; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0R; - Vector4 my4 = s.V4R; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0R = my0 + mb0; - d.V7R = my0 - mb0; - d.V1R = my1 + mb1; - d.V6R = my1 - mb1; - d.V2R = my2 + mb2; - d.V5R = my2 - mb2; - d.V3R = my3 + mb3; - d.V4R = my3 - mb3; + Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826; + Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5; + Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5; + Vector4 z3 = tmp11 * mm128_F_0_7071; + + Vector4 z11 = tmp7 + z3; + Vector4 z13 = tmp7 - z3; + + Unsafe.Add(ref vecRef, 10) = z13 + z2; + Unsafe.Add(ref vecRef, 6) = z13 - z2; + Unsafe.Add(ref vecRef, 2) = z11 + z4; + Unsafe.Add(ref vecRef, 14) = z11 - z4; + } } } } From 22946934461af71b8e5bd0006608b1b3120f76f5 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 09:09:36 +0300 Subject: [PATCH 51/69] Fused transpose with zig-zag ordering --- .../Components/Decoder/HuffmanScanDecoder.cs | 10 
+++++----- src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs | 16 ++++++++++++++++ src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs | 3 +++ .../ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs | 14 +++++++++++--- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs index bc9a53ea04..71f1689f5c 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs @@ -501,7 +501,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder { i += r; s = buffer.Receive(s); - Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s; + Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[i++]) = (short)s; } else { @@ -570,7 +570,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder if (s != 0) { s = buffer.Receive(s); - Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low); + Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[i]) = (short)(s << low); } else { @@ -646,7 +646,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder do { - ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]); + ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]); if (coef != 0) { buffer.CheckBits(); @@ -672,7 +672,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder if ((s != 0) && (k < 64)) { - Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s; + Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]) = (short)s; } } } @@ -681,7 +681,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder { for (; k <= end; k++) { - ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]); + ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]); if (coef != 0) { diff --git 
a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs index e519a8a1dc..f52cb6c105 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs @@ -35,5 +35,21 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }; + + public static ReadOnlySpan TransposingOrder => new byte[] + { + 0, 8, 1, 2, 9, 16, 24, 17, + 10, 3, 4, 11, 18, 25, 32, 40, + 33, 26, 19, 12, 5, 6, 13, 20, + 27, 34, 41, 48, 56, 49, 42, 35, + 28, 21, 14, 7, 15, 22, 29, 36, + 43, 50, 57, 58, 51, 44, 37, 30, + 23, 31, 38, 45, 52, 59, 60, 53, + 46, 39, 47, 54, 61, 62, 55, 63, + + // Extra entries for safety in decoder + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63 + }; } } diff --git a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs index 9a9e5eb799..73763f4ab8 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs @@ -942,6 +942,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg break; } } + + // Adjusting table for IDCT step during decompression + FastFloatingPointDCT.AdjustToIDCT(ref table); } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs index 39046438a8..b67ad85eea 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. 
+using System; using SixLabors.ImageSharp.Formats.Jpeg.Components; using Xunit; @@ -9,8 +10,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg [Trait("Format", "Jpg")] public class ZigZagTests { - [Fact] - public void ZigZagCanHandleAllPossibleCoefficients() + private static void CanHandleAllPossibleCoefficients(ReadOnlySpan order) { // Mimic the behaviour of the huffman scan decoder using all possible byte values short[] block = new short[64]; @@ -26,7 +26,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg if (s != 0) { i += r; - block[ZigZag.ZigZagOrder[i++]] = (short)s; + block[order[i++]] = (short)s; } else { @@ -40,5 +40,13 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg } } } + + [Fact] + public static void ZigZagCanHandleAllPossibleCoefficients() => + CanHandleAllPossibleCoefficients(ZigZag.ZigZagOrder); + + [Fact] + public static void TrasposingZigZagCanHandleAllPossibleCoefficients() => + CanHandleAllPossibleCoefficients(ZigZag.TransposingOrder); } } From eaa73732cf8a8a7b64354f7a8f2b81b814e644e0 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 09:10:44 +0300 Subject: [PATCH 52/69] Comments --- src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs index f52cb6c105..ab80b3ae67 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs @@ -36,6 +36,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 63, 63, 63, 63, 63, 63, 63, 63 }; + /// + /// Gets span of zig-zag with fused transpose step ordering indices. + /// + /// + /// When reading corrupted data, the Huffman decoders could attempt + /// to reference an entry beyond the end of this array (if the decoded + /// zero run length reaches past the end of the block). 
To prevent + /// wild stores without adding an inner-loop test, we put some extra + /// "63"s after the real entries. This will cause the extra coefficient + /// to be stored in location 63 of the block, not somewhere random. + /// The worst case would be a run-length of 15, which means we need 16 + /// fake entries. + /// public static ReadOnlySpan TransposingOrder => new byte[] { 0, 8, 1, 2, 9, 16, 24, 17, From 7c1f05b0a8aebad8a5515cb8c5c42780b0fc9a22 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 21 Nov 2021 11:28:02 +0100 Subject: [PATCH 53/69] Avoid allocating too many CostInterval objects --- .../Formats/Webp/Lossless/CostManager.cs | 37 ++++++++++++++----- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs b/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs index 94c7bd8470..213971764b 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs @@ -14,6 +14,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { private CostInterval head; + private const int FreeIntervalsStartCount = 25; + + private readonly Stack freeIntervals = new(FreeIntervalsStartCount); + public CostManager(ushort[] distArray, int pixCount, CostModel costModel) { int costCacheSize = pixCount > BackwardReferenceEncoder.MaxLength ? BackwardReferenceEncoder.MaxLength : pixCount; @@ -24,6 +28,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless this.DistArray = distArray; this.Count = 0; + for (int i = 0; i < FreeIntervalsStartCount; i++) + { + this.freeIntervals.Push(new CostInterval()); + } + // Fill in the cost cache. 
this.CacheIntervalsSize++; this.CostCache.Add(costModel.GetLengthCost(0)); @@ -201,10 +210,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless this.InsertInterval(interval, interval.Cost, interval.Index, end, endOriginal); break; } - else - { - interval.End = start; - } + + interval.End = start; } } @@ -226,6 +233,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless this.ConnectIntervals(interval.Previous, interval.Next); this.Count--; + + interval.Next = null; + interval.Previous = null; + this.freeIntervals.Push(interval); } private void InsertInterval(CostInterval intervalIn, float cost, int position, int start, int end) @@ -236,13 +247,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } // TODO: should we use COST_CACHE_INTERVAL_SIZE_MAX? - var intervalNew = new CostInterval() + CostInterval intervalNew; + if (this.freeIntervals.Count > 0) { - Cost = cost, - Start = start, - End = end, - Index = position - }; + intervalNew = this.freeIntervals.Pop(); + intervalNew.Cost = cost; + intervalNew.Start = start; + intervalNew.End = end; + intervalNew.Index = position; + } + else + { + intervalNew = new CostInterval() { Cost = cost, Start = start, End = end, Index = position }; + } this.PositionOrphanInterval(intervalNew, intervalIn); this.Count++; From b8925e1aaf231c416a3b0e77d2e66f46c45797f8 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 21 Nov 2021 12:33:24 +0100 Subject: [PATCH 54/69] CostsManager now uses MemoryAllocator --- .../Webp/Lossless/BackwardReferenceEncoder.cs | 32 ++++++----- .../Formats/Webp/Lossless/CostManager.cs | 57 ++++++++++++++----- 2 files changed, 61 insertions(+), 28 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs index 93f6372c64..82aa3ff7b1 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs @@ -2,6 +2,7 @@ // 
Licensed under the Apache License, Version 2.0. using System; +using System.Buffers; using System.Collections.Generic; using SixLabors.ImageSharp.Memory; @@ -102,7 +103,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless if ((lz77TypeBest == (int)Vp8LLz77Type.Lz77Standard || lz77TypeBest == (int)Vp8LLz77Type.Lz77Box) && quality >= 25) { Vp8LHashChain hashChainTmp = lz77TypeBest == (int)Vp8LLz77Type.Lz77Standard ? hashChain : hashChainBox; - BackwardReferencesTraceBackwards(width, height, bgra, cacheBits, hashChainTmp, best, worst); + BackwardReferencesTraceBackwards(width, height, memoryAllocator, bgra, cacheBits, hashChainTmp, best, worst); var histo = new Vp8LHistogram(worst, cacheBits); double bitCostTrace = histo.EstimateBits(stats, bitsEntropy); if (bitCostTrace < bitCostBest) @@ -236,6 +237,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static void BackwardReferencesTraceBackwards( int xSize, int ySize, + MemoryAllocator memoryAllocator, ReadOnlySpan bgra, int cacheBits, Vp8LHashChain hashChain, @@ -243,22 +245,24 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless Vp8LBackwardRefs refsDst) { int distArraySize = xSize * ySize; - ushort[] distArray = new ushort[distArraySize]; + using IMemoryOwner distArrayBuffer = memoryAllocator.Allocate(distArraySize); + Span distArray = distArrayBuffer.GetSpan(); - BackwardReferencesHashChainDistanceOnly(xSize, ySize, bgra, cacheBits, hashChain, refsSrc, distArray); + BackwardReferencesHashChainDistanceOnly(xSize, ySize, memoryAllocator, bgra, cacheBits, hashChain, refsSrc, distArrayBuffer); int chosenPathSize = TraceBackwards(distArray, distArraySize); - Span chosenPath = distArray.AsSpan(distArraySize - chosenPathSize); + Span chosenPath = distArray.Slice(distArraySize - chosenPathSize); BackwardReferencesHashChainFollowChosenPath(bgra, cacheBits, chosenPath, chosenPathSize, hashChain, refsDst); } private static void BackwardReferencesHashChainDistanceOnly( int xSize, int ySize, + 
MemoryAllocator memoryAllocator, ReadOnlySpan bgra, int cacheBits, Vp8LHashChain hashChain, Vp8LBackwardRefs refs, - ushort[] distArray) + IMemoryOwner distArrayBuffer) { int pixCount = xSize * ySize; bool useColorCache = cacheBits > 0; @@ -277,22 +281,24 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } costModel.Build(xSize, cacheBits, refs); - var costManager = new CostManager(distArray, pixCount, costModel); + var costManager = new CostManager(memoryAllocator, distArrayBuffer, pixCount, costModel); + Span costManagerCosts = costManager.Costs.GetSpan(); + Span distArray = distArrayBuffer.GetSpan(); // We loop one pixel at a time, but store all currently best points to non-processed locations from this point. distArray[0] = 0; // Add first pixel as literal. - AddSingleLiteralWithCostModel(bgra, colorCache, costModel, 0, useColorCache, 0.0f, costManager.Costs, distArray); + AddSingleLiteralWithCostModel(bgra, colorCache, costModel, 0, useColorCache, 0.0f, costManagerCosts, distArray); for (int i = 1; i < pixCount; i++) { - float prevCost = costManager.Costs[i - 1]; + float prevCost = costManagerCosts[i - 1]; int offset = hashChain.FindOffset(i); int len = hashChain.FindLength(i); // Try adding the pixel as a literal. - AddSingleLiteralWithCostModel(bgra, colorCache, costModel, i, useColorCache, prevCost, costManager.Costs, distArray); + AddSingleLiteralWithCostModel(bgra, colorCache, costModel, i, useColorCache, prevCost, costManagerCosts, distArray); // If we are dealing with a non-literal. 
if (len >= 2) @@ -336,7 +342,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless costManager.UpdateCostAtIndex(j - 1, false); costManager.UpdateCostAtIndex(j, false); - costManager.PushInterval(costManager.Costs[j - 1] + offsetCost, j, lenJ); + costManager.PushInterval(costManagerCosts[j - 1] + offsetCost, j, lenJ); reach = j + lenJ - 1; } } @@ -348,7 +354,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } } - private static int TraceBackwards(ushort[] distArray, int distArraySize) + private static int TraceBackwards(Span distArray, int distArraySize) { int chosenPathSize = 0; int pathPos = distArraySize; @@ -428,8 +434,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless int idx, bool useColorCache, float prevCost, - float[] cost, - ushort[] distArray) + Span cost, + Span distArray) { double costVal = prevCost; uint color = bgra[idx]; diff --git a/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs b/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs index 213971764b..3ee1021386 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs @@ -1,7 +1,10 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System; +using System.Buffers; using System.Collections.Generic; +using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Formats.Webp.Lossless { @@ -10,21 +13,23 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// It caches the different CostCacheInterval, caches the different /// GetLengthCost(costModel, k) in costCache and the CostInterval's. 
/// - internal class CostManager + internal class CostManager : IDisposable { + private bool disposed; + private CostInterval head; private const int FreeIntervalsStartCount = 25; private readonly Stack freeIntervals = new(FreeIntervalsStartCount); - public CostManager(ushort[] distArray, int pixCount, CostModel costModel) + public CostManager(MemoryAllocator memoryAllocator, IMemoryOwner distArray, int pixCount, CostModel costModel) { int costCacheSize = pixCount > BackwardReferenceEncoder.MaxLength ? BackwardReferenceEncoder.MaxLength : pixCount; this.CacheIntervals = new List(); this.CostCache = new List(); - this.Costs = new float[pixCount]; + this.Costs = memoryAllocator.Allocate(pixCount); this.DistArray = distArray; this.Count = 0; @@ -73,10 +78,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } // Set the initial costs high for every pixel as we will keep the minimum. - for (int i = 0; i < pixCount; i++) - { - this.Costs[i] = 1e38f; - } + this.Costs.GetSpan().Fill(1e38f); } /// @@ -91,9 +93,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless public int CacheIntervalsSize { get; } - public float[] Costs { get; } + public IMemoryOwner Costs { get; } - public ushort[] DistArray { get; } + public IMemoryOwner DistArray { get; } public List CacheIntervals { get; } @@ -137,6 +139,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless // interval logic, just serialize it right away. This constant is empirical. 
int skipDistance = 10; + Span costs = this.Costs.GetSpan(); + Span distArray = this.DistArray.GetSpan(); if (len < skipDistance) { for (int j = position; j < position + len; j++) @@ -144,10 +148,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless int k = j - position; float costTmp = (float)(distanceCost + this.CostCache[k]); - if (this.Costs[j] > costTmp) + if (costs[j] > costTmp) { - this.Costs[j] = costTmp; - this.DistArray[j] = (ushort)(k + 1); + costs[j] = costTmp; + distArray[j] = (ushort)(k + 1); } } @@ -314,12 +318,35 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// private void UpdateCost(int i, int position, float cost) { + Span costs = this.Costs.GetSpan(); + Span distArray = this.DistArray.GetSpan(); int k = i - position; - if (this.Costs[i] > cost) + if (costs[i] > cost) + { + costs[i] = cost; + distArray[i] = (ushort)(k + 1); + } + } + + protected virtual void Dispose(bool disposing) + { + if (!this.disposed) { - this.Costs[i] = cost; - this.DistArray[i] = (ushort)(k + 1); + if (disposing) + { + this.Costs.Dispose(); + } + + this.disposed = true; } } + + /// + public void Dispose() + { + // Do not change this code. 
Put cleanup code in 'Dispose(bool disposing)' method + this.Dispose(disposing: true); + GC.SuppressFinalize(this); + } } } From e03709d7b6d80ea0a1681cecfe92842a9ac39db6 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 21 Nov 2021 13:18:43 +0100 Subject: [PATCH 55/69] Make StorageOrder bytes a ReadOnlySpan --- .../Formats/Webp/Lossless/Vp8LEncoder.cs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs index 48f7d0e2b7..8ca80d5ab4 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs @@ -137,6 +137,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } } + // RFC 1951 will calm you down if you are worried about this funny sequence. + // This sequence is tuned from that, but more weighted for lower symbol count, + // and more spiking histograms. + // This uses C#'s compiler optimization to refer to assembly's static data directly. + private static ReadOnlySpan StorageOrder => new byte[] { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + // This uses C#'s compiler optimization to refer to assembly's static data directly. private static ReadOnlySpan Order => new byte[] { 1, 2, 0, 3 }; @@ -942,16 +948,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private void StoreHuffmanTreeOfHuffmanTreeToBitMask(byte[] codeLengthBitDepth) { - // RFC 1951 will calm you down if you are worried about this funny sequence. - // This sequence is tuned from that, but more weighted for lower symbol count, - // and more spiking histograms. 
- byte[] storageOrder = { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; - // Throw away trailing zeros: int codesToStore = WebpConstants.CodeLengthCodes; for (; codesToStore > 4; codesToStore--) { - if (codeLengthBitDepth[storageOrder[codesToStore - 1]] != 0) + if (codeLengthBitDepth[StorageOrder[codesToStore - 1]] != 0) { break; } @@ -960,7 +961,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless this.bitWriter.PutBits((uint)codesToStore - 4, 4); for (int i = 0; i < codesToStore; i++) { - this.bitWriter.PutBits(codeLengthBitDepth[storageOrder[i]], 3); + this.bitWriter.PutBits(codeLengthBitDepth[StorageOrder[i]], 3); } } From 92ac52221cf2a34b20b5c0aaca8124b522dba449 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 21 Nov 2021 13:58:34 +0100 Subject: [PATCH 56/69] Remove not needed DeepClone --- src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs | 4 +--- src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs index 0376311ed9..07fec7f990 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs @@ -9,7 +9,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// Represents the Huffman tree. /// [DebuggerDisplay("TotalCount = {TotalCount}, Value = {Value}, Left = {PoolIndexLeft}, Right = {PoolIndexRight}")] - internal struct HuffmanTree : IDeepCloneable + internal struct HuffmanTree { /// /// Initializes a new instance of the struct. @@ -57,7 +57,5 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return t1.Value < t2.Value ? 
-1 : 1; } - - public IDeepCloneable DeepClone() => new HuffmanTree(this); } } diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs index 66170b85fd..56f2ee9cef 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs @@ -219,8 +219,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless while (treeSize > 1) { // Finish when we have only one root. - treePool[treePoolSize++] = (HuffmanTree)tree[treeSize - 1].DeepClone(); - treePool[treePoolSize++] = (HuffmanTree)tree[treeSize - 2].DeepClone(); + treePool[treePoolSize++] = tree[treeSize - 1]; + treePool[treePoolSize++] = tree[treeSize - 2]; int count = treePool[treePoolSize - 1].TotalCount + treePool[treePoolSize - 2].TotalCount; treeSize -= 2; @@ -239,7 +239,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless int startIdx = endIdx + num - 1; for (int i = startIdx; i >= endIdx; i--) { - tree[i] = (HuffmanTree)tree[i - 1].DeepClone(); + tree[i] = tree[i - 1]; } tree[k].TotalCount = count; From d0382bbb03c1c4f7c1447bcd46d8378d85d7b314 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 21 Nov 2021 14:51:12 +0100 Subject: [PATCH 57/69] Change PixOrCopyMode, HistoIx and EntropyIx enums to be a byte --- src/ImageSharp/Formats/Webp/EntropyIx.cs | 2 +- src/ImageSharp/Formats/Webp/HistoIx.cs | 2 +- src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/EntropyIx.cs b/src/ImageSharp/Formats/Webp/EntropyIx.cs index c72ddeb42d..98e8b7e164 100644 --- a/src/ImageSharp/Formats/Webp/EntropyIx.cs +++ b/src/ImageSharp/Formats/Webp/EntropyIx.cs @@ -6,7 +6,7 @@ namespace SixLabors.ImageSharp.Formats.Webp /// /// These five modes are evaluated and their respective entropy is computed. 
/// - internal enum EntropyIx + internal enum EntropyIx : byte { Direct = 0, diff --git a/src/ImageSharp/Formats/Webp/HistoIx.cs b/src/ImageSharp/Formats/Webp/HistoIx.cs index 68b00394b0..83522f9da8 100644 --- a/src/ImageSharp/Formats/Webp/HistoIx.cs +++ b/src/ImageSharp/Formats/Webp/HistoIx.cs @@ -3,7 +3,7 @@ namespace SixLabors.ImageSharp.Formats.Webp { - internal enum HistoIx + internal enum HistoIx : byte { HistoAlpha = 0, diff --git a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs index 0d7023ffc2..26099b9023 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs @@ -3,7 +3,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { - internal enum PixOrCopyMode + internal enum PixOrCopyMode : byte { Literal, From e011450a90380015893f12f1f5d27aecd8e021ee Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 21 Nov 2021 15:15:24 +0100 Subject: [PATCH 58/69] Dispose cost manager and hashChainBox --- .../Formats/Webp/Lossless/BackwardReferenceEncoder.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs index 82aa3ff7b1..c394a8caa8 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs @@ -114,6 +114,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless BackwardReferences2DLocality(width, best); + hashChainBox?.Dispose(); + return best; } @@ -281,7 +283,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } costModel.Build(xSize, cacheBits, refs); - var costManager = new CostManager(memoryAllocator, distArrayBuffer, pixCount, costModel); + using var costManager = new CostManager(memoryAllocator, distArrayBuffer, pixCount, costModel); Span costManagerCosts = 
costManager.Costs.GetSpan(); Span distArray = distArrayBuffer.GetSpan(); From fae8f0dc319c4c46fa73e7412e9b5db6b58a9e93 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 21 Nov 2021 16:17:29 +0100 Subject: [PATCH 59/69] Initialize backward refs with the pixel count --- src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs | 2 +- src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs index 502728b15f..fca4ec59f6 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs @@ -7,7 +7,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless { internal class Vp8LBackwardRefs { - public Vp8LBackwardRefs() => this.Refs = new List(); + public Vp8LBackwardRefs(int pixels) => this.Refs = new List(pixels); /// /// Gets or sets the common block-size. diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs index 8ca80d5ab4..adabd0ac3f 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs @@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless int refsBlockSize = ((pixelCount - 1) / MaxRefsBlockPerImage) + 1; for (int i = 0; i < this.Refs.Length; i++) { - this.Refs[i] = new Vp8LBackwardRefs + this.Refs[i] = new Vp8LBackwardRefs(pixelCount) { BlockSize = refsBlockSize < MinBlockSize ? 
MinBlockSize : refsBlockSize }; From af0b8ac3dd10e7dadd88a4baa84ab298ef2c4b59 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 21:46:11 +0300 Subject: [PATCH 60/69] Fixed compilation errors, fixed tests --- .../Decoder/JpegBlockPostProcessor.cs | 8 +- .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs | 211 ++++++++---------- 2 files changed, 92 insertions(+), 127 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs index 085cd4a291..15f212b400 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs @@ -18,11 +18,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder /// public Block8x8F SourceBlock; - /// - /// Temporal block to store intermediate computation results. - /// - public Block8x8F WorkspaceBlock; - /// /// The quantization table as . /// @@ -45,7 +40,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder this.subSamplingDivisors = component.SubSamplingDivisors; this.SourceBlock = default; - this.WorkspaceBlock = default; } /// @@ -71,7 +65,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder // Dequantize: block.MultiplyInPlace(ref this.DequantiazationTable); - FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock); + FastFloatingPointDCT.TransformIDCT(ref block); // To conform better to libjpeg we actually NEED TO loose precision here. // This is because they store blocks as Int16 between all the operations. diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index 0a49d20cd4..3a6eb4f8bf 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -2,9 +2,6 @@ // Licensed under the Apache License, Version 2.0. 
using System; -#if SUPPORTS_RUNTIME_INTRINSICS -using System.Runtime.Intrinsics.X86; -#endif using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils; using SixLabors.ImageSharp.Tests.TestUtilities; @@ -17,6 +14,20 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg [Trait("Format", "Jpg")] public static class DCTTests { + private const int MaxAllowedValue = short.MaxValue; + private const int MinAllowedValue = short.MinValue; + + internal static Block8x8F CreateBlockFromScalar(float value) + { + Block8x8F result = default; + for (int i = 0; i < Block8x8F.Size; i++) + { + result[i] = value; + } + + return result; + } + public class FastFloatingPoint : JpegFixture { public FastFloatingPoint(ITestOutputHelper output) @@ -24,130 +35,77 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { } - // Reference tests [Theory] [InlineData(1)] [InlineData(2)] [InlineData(3)] public void LLM_TransformIDCT_CompareToNonOptimized(int seed) { - float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); + float[] sourceArray = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed); var srcBlock = Block8x8F.Load(sourceArray); + // reference Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock); - var temp = default(Block8x8F); - FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp); - - this.CompareBlocks(expected, srcBlock, 1f); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - [InlineData(3)] - public void LLM_TransformIDCT_CompareToAccurate(int seed) - { - float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); + // testee + // Part of the IDCT calculations is fused into the quantization step + // We must multiply input block with adjusted no-quantization matrix + // before applying IDCT + Block8x8F dequantMatrix = CreateBlockFromScalar(1); - var srcBlock = Block8x8F.Load(sourceArray); + // Dequantization using unit matrix - no values 
are upscaled + // as quant matrix is all 1's + // This step is needed to apply adjusting multipliers to the input block + FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix); + srcBlock.MultiplyInPlace(ref dequantMatrix); - Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock); + // IDCT implementation tranforms blocks after transposition + srcBlock.TransposeInplace(); - var temp = default(Block8x8F); - FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp); + // IDCT calculation + FastFloatingPointDCT.TransformIDCT(ref srcBlock); this.CompareBlocks(expected, srcBlock, 1f); } - // Inverse transform - [Theory] - [InlineData(1)] - [InlineData(2)] - public void IDCT8x4_LeftPart(int seed) - { - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); - - var expectedDest = new float[64]; - - // reference - ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest); - - // testee - FastFloatingPointDCT.IDCT8x4_LeftPart(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - [Theory] [InlineData(1)] [InlineData(2)] - public void IDCT8x4_RightPart(int seed) + [InlineData(3)] + public void LLM_TransformIDCT_CompareToAccurate(int seed) { - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); + float[] sourceArray = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed); - var destBlock = default(Block8x8F); - - var expectedDest = new float[64]; + var srcBlock = Block8x8F.Load(sourceArray); // reference - ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); + Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock); // 
testee - FastFloatingPointDCT.IDCT8x4_RightPart(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void IDCT8x8_Avx(int seed) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (!Avx.IsSupported) - { - this.Output.WriteLine("No AVX present, skipping test!"); - return; - } - - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - Block8x8F srcBlock = default; - srcBlock.LoadFrom(src); + // Part of the IDCT calculations is fused into the quantization step + // We must multiply input block with adjusted no-quantization matrix + // before applying IDCT + Block8x8F dequantMatrix = CreateBlockFromScalar(1); - Block8x8F destBlock = default; + // Dequantization using unit matrix - no values are upscaled + // as quant matrix is all 1's + // This step is needed to apply adjusting multipliers to the input block + FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix); + srcBlock.MultiplyInPlace(ref dequantMatrix); - float[] expectedDest = new float[64]; + // IDCT implementation tranforms blocks after transposition + srcBlock.TransposeInplace(); - // reference, left part - ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest); + // IDCT calculation + FastFloatingPointDCT.TransformIDCT(ref srcBlock); - // reference, right part - ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); - - // testee, whole 8x8 - FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock); - - float[] actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); -#endif + this.CompareBlocks(expected, srcBlock, 1f); } + // Inverse transform + // This test covers entire IDCT conversion chain + // This test checks all hardware implementations [Theory] 
[InlineData(1)] [InlineData(2)] @@ -157,41 +115,53 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { int seed = FeatureTestRunner.Deserialize(serialized); - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); + Span src = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed); var srcBlock = default(Block8x8F); srcBlock.LoadFrom(src); - var expectedDest = new float[64]; - var temp1 = new float[64]; - var temp2 = default(Block8x8F); + float[] expectedDest = new float[64]; + float[] temp = new float[64]; // reference - ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1); + ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp); // testee - FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2); + // Part of the IDCT calculations is fused into the quantization step + // We must multiply input block with adjusted no-quantization matrix + // before applying IDCT + Block8x8F dequantMatrix = CreateBlockFromScalar(1); + + // Dequantization using unit matrix - no values are upscaled + // as quant matrix is all 1's + // This step is needed to apply adjusting multipliers to the input block + FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix); + srcBlock.MultiplyInPlace(ref dequantMatrix); + + // IDCT implementation tranforms blocks after transposition + srcBlock.TransposeInplace(); - var actualDest = new float[64]; - srcBlock.ScaledCopyTo(actualDest); + // IDCT calculation + FastFloatingPointDCT.TransformIDCT(ref srcBlock); + + float[] actualDest = srcBlock.ToArray(); Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); } - // 3 paths: + // 4 paths: // 1. AllowAll - call avx/fma implementation - // 2. DisableFMA - call avx implementation without fma acceleration - // 3. DisableAvx - call fallback code of Vector4 implementation - // - // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result + // 2. 
DisableFMA - call avx without fma implementation + // 3. DisableAvx - call sse Vector4 implementation + // 4. DisableHWIntrinsic - call scalar fallback implementation FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, seed, - HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX); + HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic); } // Forward transform - // This test covers entire FDCT conversions chain - // This test checks all implementations: intrinsic and scalar fallback + // This test covers entire FDCT conversion chain + // This test checks all hardware implementations [Theory] [InlineData(1)] [InlineData(2)] @@ -201,7 +171,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { int seed = FeatureTestRunner.Deserialize(serialized); - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); + Span src = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed); var block = default(Block8x8F); block.LoadFrom(src); @@ -212,23 +182,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true); // testee - // Part of the FDCT calculations is fused into the quantization step - // We must multiply transformed block with reciprocal values from FastFloatingPointDCT.ANN_DCT_reciprocalAdjustmen FastFloatingPointDCT.TransformFDCT(ref block); - for (int i = 0; i < 64; i++) - { - block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i]; - } + + // Part of the IDCT calculations is fused into the quantization step + // We must multiply input block with adjusted no-quantization matrix + // after applying FDCT + Block8x8F quantMatrix = CreateBlockFromScalar(1); + FastFloatingPointDCT.AdjustToFDCT(ref quantMatrix); + block.MultiplyInPlace(ref quantMatrix); float[] actualDest = block.ToArray(); Assert.Equal(expectedDest, actualDest, new 
ApproximateFloatComparer(1f)); } - // 3 paths: + // 4 paths: // 1. AllowAll - call avx/fma implementation - // 2. DisableFMA - call avx implementation without fma acceleration - // 3. DisableAvx - call sse implementation + // 2. DisableFMA - call avx without fma implementation + // 3. DisableAvx - call sse Vector4 implementation // 4. DisableHWIntrinsic - call scalar fallback implementation FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, From 1631e9d0c7c8be82c99525fcc308d85628d3ff47 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 21 Nov 2021 22:03:37 +0300 Subject: [PATCH 61/69] Updated benchmark results --- tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs index cb89f90829..102ba8dd3c 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs @@ -66,16 +66,17 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg /* -BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1288 (20H2/October2020Update) +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1348 (20H2/October2020Update) Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores .NET SDK=6.0.100-preview.3.21202.5 [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT DefaultJob : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + | Method | Mean | Error | StdDev | |------------------------------------ |----------:|----------:|----------:| -| 'Baseline 4:4:4 Interleaved' | 11.781 ms | 0.0737 ms | 0.0654 ms | -| 'Baseline 4:2:0 Interleaved' | 8.688 ms | 0.0345 ms | 0.0306 ms | -| 'Baseline 4:0:0 (grayscale)' | 1.643 ms | 0.0092 ms | 0.0086 ms | -| 'Progressive 4:2:0 Non-Interleaved' | 13.770 ms | 0.0928 ms | 0.0823 ms | +| 'Baseline 4:4:4 Interleaved' | 11.532 ms | 
0.0530 ms | 0.0496 ms | +| 'Baseline 4:2:0 Interleaved' | 8.458 ms | 0.0289 ms | 0.0256 ms | +| 'Baseline 4:0:0 (grayscale)' | 1.550 ms | 0.0050 ms | 0.0044 ms | +| 'Progressive 4:2:0 Non-Interleaved' | 13.220 ms | 0.0449 ms | 0.0398 ms | */ From 50b0f0b07f23472fe3093c19ba0fd98a3921639f Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 23 Nov 2021 01:21:03 +0300 Subject: [PATCH 62/69] Test qol fixes --- tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs | 10 ++++------ .../ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index 3a6eb4f8bf..85f30d28d7 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -52,16 +52,15 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg // Part of the IDCT calculations is fused into the quantization step // We must multiply input block with adjusted no-quantization matrix // before applying IDCT + // Dequantization using unit matrix - no values are upscaled Block8x8F dequantMatrix = CreateBlockFromScalar(1); - // Dequantization using unit matrix - no values are upscaled - // as quant matrix is all 1's // This step is needed to apply adjusting multipliers to the input block FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix); - srcBlock.MultiplyInPlace(ref dequantMatrix); // IDCT implementation tranforms blocks after transposition srcBlock.TransposeInplace(); + srcBlock.MultiplyInPlace(ref dequantMatrix); // IDCT calculation FastFloatingPointDCT.TransformIDCT(ref srcBlock); @@ -86,16 +85,15 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg // Part of the IDCT calculations is fused into the quantization step // We must multiply input block with adjusted no-quantization matrix // before applying IDCT + // Dequantization using unit matrix - no values are upscaled Block8x8F dequantMatrix = 
CreateBlockFromScalar(1); - // Dequantization using unit matrix - no values are upscaled - // as quant matrix is all 1's // This step is needed to apply adjusting multipliers to the input block FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix); - srcBlock.MultiplyInPlace(ref dequantMatrix); // IDCT implementation tranforms blocks after transposition srcBlock.TransposeInplace(); + srcBlock.MultiplyInPlace(ref dequantMatrix); // IDCT calculation FastFloatingPointDCT.TransformIDCT(ref srcBlock); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs index a76b2bf2ef..1bdfc6ecad 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs @@ -172,7 +172,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils bool failed = false; - for (int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { float expected = a[i]; float actual = b[i]; From cf4cf239c71793100ed006541a09bd042e797c5a Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 23 Nov 2021 01:42:36 +0300 Subject: [PATCH 63/69] Shared infrastructure subproject commit hash --- shared-infrastructure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared-infrastructure b/shared-infrastructure index 33cb12ca77..a042aba176 160000 --- a/shared-infrastructure +++ b/shared-infrastructure @@ -1 +1 @@ -Subproject commit 33cb12ca77f919b44de56f344d2627cc2a108c3a +Subproject commit a042aba176cdb840d800c6ed4cfe41a54fb7b1e3 From 3069dd18de030544207100b3012ec009787dab3f Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 23 Nov 2021 05:48:04 +0300 Subject: [PATCH 64/69] Fixed failing tests --- .../Formats/Jpeg/Components/Block8x8.cs | 58 +++++++++++++++++++ .../Jpg/Utils/LibJpegTools.ComponentData.cs | 15 ++--- 2 files changed, 66 insertions(+), 7 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs 
b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index 9d49b8c45f..27bb2fc3cd 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -337,6 +337,64 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } } + /// + /// Transpose the block inplace. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public void TransposeInplace() + { + ref short elemRef = ref Unsafe.As(ref this); + + // row #0 + Swap(ref Unsafe.Add(ref elemRef, 1), ref Unsafe.Add(ref elemRef, 8)); + Swap(ref Unsafe.Add(ref elemRef, 2), ref Unsafe.Add(ref elemRef, 16)); + Swap(ref Unsafe.Add(ref elemRef, 3), ref Unsafe.Add(ref elemRef, 24)); + Swap(ref Unsafe.Add(ref elemRef, 4), ref Unsafe.Add(ref elemRef, 32)); + Swap(ref Unsafe.Add(ref elemRef, 5), ref Unsafe.Add(ref elemRef, 40)); + Swap(ref Unsafe.Add(ref elemRef, 6), ref Unsafe.Add(ref elemRef, 48)); + Swap(ref Unsafe.Add(ref elemRef, 7), ref Unsafe.Add(ref elemRef, 56)); + + // row #1 + Swap(ref Unsafe.Add(ref elemRef, 10), ref Unsafe.Add(ref elemRef, 17)); + Swap(ref Unsafe.Add(ref elemRef, 11), ref Unsafe.Add(ref elemRef, 25)); + Swap(ref Unsafe.Add(ref elemRef, 12), ref Unsafe.Add(ref elemRef, 33)); + Swap(ref Unsafe.Add(ref elemRef, 13), ref Unsafe.Add(ref elemRef, 41)); + Swap(ref Unsafe.Add(ref elemRef, 14), ref Unsafe.Add(ref elemRef, 49)); + Swap(ref Unsafe.Add(ref elemRef, 15), ref Unsafe.Add(ref elemRef, 57)); + + // row #2 + Swap(ref Unsafe.Add(ref elemRef, 19), ref Unsafe.Add(ref elemRef, 26)); + Swap(ref Unsafe.Add(ref elemRef, 20), ref Unsafe.Add(ref elemRef, 34)); + Swap(ref Unsafe.Add(ref elemRef, 21), ref Unsafe.Add(ref elemRef, 42)); + Swap(ref Unsafe.Add(ref elemRef, 22), ref Unsafe.Add(ref elemRef, 50)); + Swap(ref Unsafe.Add(ref elemRef, 23), ref Unsafe.Add(ref elemRef, 58)); + + // row #3 + Swap(ref Unsafe.Add(ref elemRef, 28), ref Unsafe.Add(ref elemRef, 35)); + Swap(ref Unsafe.Add(ref elemRef, 29), ref Unsafe.Add(ref elemRef, 43)); + Swap(ref 
Unsafe.Add(ref elemRef, 30), ref Unsafe.Add(ref elemRef, 51)); + Swap(ref Unsafe.Add(ref elemRef, 31), ref Unsafe.Add(ref elemRef, 59)); + + // row #4 + Swap(ref Unsafe.Add(ref elemRef, 37), ref Unsafe.Add(ref elemRef, 44)); + Swap(ref Unsafe.Add(ref elemRef, 38), ref Unsafe.Add(ref elemRef, 52)); + Swap(ref Unsafe.Add(ref elemRef, 39), ref Unsafe.Add(ref elemRef, 60)); + + // row #5 + Swap(ref Unsafe.Add(ref elemRef, 46), ref Unsafe.Add(ref elemRef, 53)); + Swap(ref Unsafe.Add(ref elemRef, 47), ref Unsafe.Add(ref elemRef, 61)); + + // row #6 + Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62)); + + static void Swap(ref short a, ref short b) + { + short tmp = a; + a = b; + b = tmp; + } + } + /// /// Calculate the total sum of absolute differences of elements in 'a' and 'b'. /// diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs index adbd695c0e..5c00b39af8 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs @@ -48,6 +48,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils public short MaxVal { get; private set; } = short.MinValue; + internal void MakeBlock(Block8x8 block, int y, int x) + { + block.TransposeInplace(); + this.MakeBlock(block.ToArray(), y, x); + } + internal void MakeBlock(short[] data, int y, int x) { this.MinVal = Math.Min(this.MinVal, data.Min()); @@ -66,11 +72,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils Span blockRow = data.GetRowSpan(y - startIndex); for (int x = 0; x < this.WidthInBlocks; x++) { - short[] block = blockRow[x].ToArray(); - - // x coordinate stays the same - we load entire stride - // y coordinate is tricky as we load single stride to full buffer - offset is needed - this.MakeBlock(block, y, x); + this.MakeBlock(blockRow[x], y, x); } } } @@ -83,8 +85,7 @@ namespace 
SixLabors.ImageSharp.Tests.Formats.Jpg.Utils Span blockRow = data.GetRowSpan(y); for (int x = 0; x < this.WidthInBlocks; x++) { - short[] block = blockRow[x].ToArray(); - this.MakeBlock(block, y, x); + this.MakeBlock(blockRow[x], y, x); } } } From 42315dcd80d7c9ed4708ff9b0b4b31b3e7794427 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 23 Nov 2021 06:00:43 +0300 Subject: [PATCH 65/69] Updated benchmarks --- tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs index 102ba8dd3c..9665ca42d6 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs @@ -75,7 +75,7 @@ Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores | Method | Mean | Error | StdDev | |------------------------------------ |----------:|----------:|----------:| -| 'Baseline 4:4:4 Interleaved' | 11.532 ms | 0.0530 ms | 0.0496 ms | +| 'Baseline 4:4:4 Interleaved' | 11.127 ms | 0.0659 ms | 0.0550 ms | | 'Baseline 4:2:0 Interleaved' | 8.458 ms | 0.0289 ms | 0.0256 ms | | 'Baseline 4:0:0 (grayscale)' | 1.550 ms | 0.0050 ms | 0.0044 ms | | 'Progressive 4:2:0 Non-Interleaved' | 13.220 ms | 0.0449 ms | 0.0398 ms | From 14508498d9f0d709e31d4d4e4e44a740cb3353c2 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 09:00:33 +0100 Subject: [PATCH 66/69] Seal classes and avoid calling SuppressFinalize --- .../Formats/Webp/Lossless/CostManager.cs | 24 ++----------------- .../Formats/Webp/Lossless/PixOrCopy.cs | 2 +- .../Formats/Webp/Lossless/Vp8LHashChain.cs | 23 ++---------------- 3 files changed, 5 insertions(+), 44 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs b/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs index 3ee1021386..c121a41a1a 100644 --- 
a/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs @@ -13,10 +13,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// It caches the different CostCacheInterval, caches the different /// GetLengthCost(costModel, k) in costCache and the CostInterval's. /// - internal class CostManager : IDisposable + internal sealed class CostManager : IDisposable { - private bool disposed; - private CostInterval head; private const int FreeIntervalsStartCount = 25; @@ -328,25 +326,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless } } - protected virtual void Dispose(bool disposing) - { - if (!this.disposed) - { - if (disposing) - { - this.Costs.Dispose(); - } - - this.disposed = true; - } - } - /// - public void Dispose() - { - // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method - this.Dispose(disposing: true); - GC.SuppressFinalize(this); - } + public void Dispose() => this.Costs.Dispose(); } } diff --git a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs index 6cd109121d..96cdc3cbc5 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs @@ -6,7 +6,7 @@ using System.Diagnostics; namespace SixLabors.ImageSharp.Formats.Webp.Lossless { [DebuggerDisplay("Mode: {Mode}, Len: {Len}, BgraOrDistance: {BgraOrDistance}")] - internal class PixOrCopy + internal sealed class PixOrCopy { public PixOrCopyMode Mode { get; set; } diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs index 2aa35e392e..1bc7613a90 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs @@ -8,7 +8,7 @@ using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Formats.Webp.Lossless { - internal class Vp8LHashChain : IDisposable + internal sealed class 
Vp8LHashChain : IDisposable { private const uint HashMultiplierHi = 0xc6a4a793u; @@ -30,8 +30,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private readonly MemoryAllocator memoryAllocator; - private bool disposed; - /// /// Initializes a new instance of the class. /// @@ -288,24 +286,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless return maxWindowSize > WindowSize ? WindowSize : maxWindowSize; } - protected virtual void Dispose(bool disposing) - { - if (!this.disposed) - { - if (disposing) - { - this.OffsetLength.Dispose(); - } - - this.disposed = true; - } - } - /// - public void Dispose() - { - this.Dispose(disposing: true); - GC.SuppressFinalize(this); - } + public void Dispose() => this.OffsetLength.Dispose(); } } From 92445c35812ca66a3c7ac6cf9616ea7d89159a9e Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 23 Nov 2021 19:13:58 +0300 Subject: [PATCH 67/69] Added test for TransposeInplace --- .../Formats/Jpg/Block8x8FTests.cs | 5 +++- .../Formats/Jpg/Block8x8Tests.cs | 26 +++++++++++++++++++ .../Jpg/Utils/ReferenceImplementations.cs | 17 ++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index e5dc0ba01f..8f5f10f195 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -183,9 +183,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Assert.Equal(expected, actual); } + // This method has only 2 implementations: + // 1. AVX + // 2. 
Scalar FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic); + HwIntrinsics.AllowAll | HwIntrinsics.DisableHWIntrinsic); } private static float[] Create8x8ColorCropTestData() diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs index 3737cce804..b13a196cb9 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs @@ -276,5 +276,31 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg seed, HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); } + + [Fact] + public void TransposeInplace() + { + static void RunTest() + { + short[] expected = Create8x8ShortData(); + ReferenceImplementations.Transpose8x8(expected); + + var block8x8 = default(Block8x8); + block8x8.LoadFrom(Create8x8ShortData()); + + block8x8.TransposeInplace(); + + short[] actual = new short[64]; + block8x8.CopyTo(actual); + + Assert.Equal(expected, actual); + } + + // This method has only 1 implementation: + // 1. 
Scalar + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.DisableHWIntrinsic); + } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs index c9741521c6..8dc1c83d45 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs @@ -40,6 +40,23 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils } } + /// + /// Transpose 8x8 block stored linearly in a (inplace) + /// + internal static void Transpose8x8(Span data) + { + for (int i = 1; i < 8; i++) + { + int i8 = i * 8; + for (int j = 0; j < i; j++) + { + short tmp = data[i8 + j]; + data[i8 + j] = data[(j * 8) + i]; + data[(j * 8) + i] = tmp; + } + } + } + /// /// Transpose 8x8 block stored linearly in a /// From 28147059b43a197ffcc8b4b5cb692adc22975e8f Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 23 Nov 2021 19:42:27 +0300 Subject: [PATCH 68/69] Removed bound checks from DCT adjustment methods --- .../Jpeg/Components/FastFloatingPointDCT.cs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index a1c03e65c0..770093195e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -3,6 +3,7 @@ using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics.X86; #endif @@ -65,9 +66,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Quantization table to adjust. 
public static void AdjustToIDCT(ref Block8x8F quantTable) { - for (int i = 0; i < Block8x8F.Size; i++) + ref float tableRef = ref Unsafe.As(ref quantTable); + ref float multipliersRef = ref MemoryMarshal.GetReference(AdjustmentCoefficients); + for (nint i = 0; i < Block8x8F.Size; i++) { - quantTable[i] = quantTable[i] * AdjustmentCoefficients[i] * 0.125f; + tableRef = ref Unsafe.Add(ref tableRef, i); + tableRef = 0.125f * tableRef * Unsafe.Add(ref multipliersRef, i); } // Spectral macroblocks are transposed before quantization @@ -81,9 +85,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Quantization table to adjust. public static void AdjustToFDCT(ref Block8x8F quantTable) { - for (int i = 0; i < Block8x8F.Size; i++) + ref float tableRef = ref Unsafe.As(ref quantTable); + ref float multipliersRef = ref MemoryMarshal.GetReference(AdjustmentCoefficients); + for (nint i = 0; i < Block8x8F.Size; i++) { - quantTable[i] = 0.125f / (quantTable[i] * AdjustmentCoefficients[i]); + tableRef = ref Unsafe.Add(ref tableRef, i); + tableRef = 0.125f / (tableRef * Unsafe.Add(ref multipliersRef, i)); } } From 586df2e7b136745736884072a49ec797715d5feb Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 23 Nov 2021 21:49:27 +0300 Subject: [PATCH 69/69] Fixed crash bug --- .../Formats/Jpeg/Components/FastFloatingPointDCT.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 770093195e..81bfe2135d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -70,8 +70,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components ref float multipliersRef = ref MemoryMarshal.GetReference(AdjustmentCoefficients); for (nint i = 0; i < Block8x8F.Size; i++) { - tableRef = ref Unsafe.Add(ref tableRef, i); tableRef = 0.125f * tableRef * 
Unsafe.Add(ref multipliersRef, i); + tableRef = ref Unsafe.Add(ref tableRef, 1); } // Spectral macroblocks are transposed before quantization @@ -89,8 +89,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components ref float multipliersRef = ref MemoryMarshal.GetReference(AdjustmentCoefficients); for (nint i = 0; i < Block8x8F.Size; i++) { - tableRef = ref Unsafe.Add(ref tableRef, i); tableRef = 0.125f / (tableRef * Unsafe.Add(ref multipliersRef, i)); + tableRef = ref Unsafe.Add(ref tableRef, 1); } }