From 5033e3eb950aa15a89a1ccd1f706c629344f9119 Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Mon, 18 Jan 2021 12:49:08 +0100 Subject: [PATCH] Improve algorithm --- .../Encoder/RgbToYCbCrConverterVectorized.cs | 180 ++++++------------ .../Formats/Jpg/RgbToYCbCrConverterTests.cs | 8 +- 2 files changed, 61 insertions(+), 127 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs index ddaa2069ed..209cc3c6ab 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -15,97 +15,43 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { internal static class RgbToYCbCrConverterVectorized { - private static ReadOnlySpan ExtractionMasks => new byte[] - { - 0x0, 0xFF, 0xFF, 0xFF, 0x1, 0xFF, 0xFF, 0xFF, 0x2, 0xFF, 0xFF, 0xFF, 0x3, 0xFF, 0xFF, 0xFF, 0x10, 0xFF, 0xFF, 0xFF, 0x11, 0xFF, 0xFF, 0xFF, 0x12, 0xFF, 0xFF, 0xFF, 0x13, 0xFF, 0xFF, 0xFF, - 0x4, 0xFF, 0xFF, 0xFF, 0x5, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF, 0x14, 0xFF, 0xFF, 0xFF, 0x15, 0xFF, 0xFF, 0xFF, 0x16, 0xFF, 0xFF, 0xFF, 0x17, 0xFF, 0xFF, 0xFF, - 0x8, 0xFF, 0xFF, 0xFF, 0x9, 0xFF, 0xFF, 0xFF, 0xA, 0xFF, 0xFF, 0xFF, 0xB, 0xFF, 0xFF, 0xFF, 0x18, 0xFF, 0xFF, 0xFF, 0x19, 0xFF, 0xFF, 0xFF, 0x1A, 0xFF, 0xFF, 0xFF, 0x1B, 0xFF, 0xFF, 0xFF, - 0xC, 0xFF, 0xFF, 0xFF, 0xD, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF, 0x1C, 0xFF, 0xFF, 0xFF, 0x1D, 0xFF, 0xFF, 0xFF, 0x1E, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF, - }; - public static bool IsSupported { get { #if SUPPORTS_RUNTIME_INTRINSICS - return Avx2.IsSupported && Fma.IsSupported; + return Avx2.IsSupported; #else return false; #endif } } - public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) - { - Debug.Assert(IsSupported, "AVX2 and FMA are required to run this converter"); - #if SUPPORTS_RUNTIME_INTRINSICS - SeparateRgb(rgbSpan); - ConvertInternal(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); -#endif - } - -#if SUPPORTS_RUNTIME_INTRINSICS - /// - /// Rearranges the provided in-place - /// from { r00, g00, b00, ..., r63, g63, b63 } - /// to { r00, ... r31, g00, ..., g31, b00, ..., b31, - /// r32, ... r63, g32, ..., g63, b31, ..., b63 } - /// - /// - /// SSE is used for this operation as it is significantly faster than AVX in this specific case. - /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers. - /// - [MethodImpl(InliningOptions.ShortMethod)] - private static void SeparateRgb(ReadOnlySpan rgbSpan) + private static ReadOnlySpan MoveFirst24BytesToSeparateLanes => new byte[] { - var selectRed0 = Vector128.Create(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectRed1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectRed2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D); - - var selectGreen0 = Vector128.Create(0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectGreen1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectGreen2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E); + 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, + 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 + }; - var selectBlue0 = Vector128.Create(0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectBlue1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectBlue2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F); + private static ReadOnlySpan MoveLast24BytesToSeparateLanes => new byte[] + { + 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0 + }; - for (int i = 0; i < 2; i++) - { - ref Vector128 inRef = ref Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)), i * 6); - - Vector128 in0 = inRef; - Vector128 in1 = Unsafe.Add(ref inRef, 1); - Vector128 in2 = Unsafe.Add(ref inRef, 2); - - Vector128 r0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); - Vector128 g0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2)); - Vector128 b0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2)); - - in0 = Unsafe.Add(ref inRef, 3); - in1 = Unsafe.Add(ref inRef, 4); - in2 = Unsafe.Add(ref inRef, 5); - - Vector128 r1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); - Vector128 g1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2)); - Vector128 b1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2)); - - inRef = r0; - Unsafe.Add(ref inRef, 1) = r1; - Unsafe.Add(ref inRef, 2) = g0; - Unsafe.Add(ref inRef, 3) = g1; - Unsafe.Add(ref inRef, 4) = b0; - Unsafe.Add(ref inRef, 5) = b1; - } - } + private static ReadOnlySpan ExtractRgb => new byte[] + { + 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF + }; +#endif - /// - /// Converts the previously separated (see ) RGB values to YCbCr using AVX2 and FMA. - /// - [MethodImpl(InliningOptions.ShortMethod)] - private static void ConvertInternal(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) { + Debug.Assert(IsSupported, "AVX2 is required to run this converter"); + +#if SUPPORTS_RUNTIME_INTRINSICS var f0299 = Vector256.Create(0.299f); var f0587 = Vector256.Create(0.587f); var f0114 = Vector256.Create(0.114f); @@ -115,68 +61,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder var fn0418688 = Vector256.Create(-0.418688f); var fn0081312F = Vector256.Create(-0.081312F); var f05 = Vector256.Create(0.5f); + var zero = Vector256.Create(0).AsByte(); ref Vector256 inRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); - - for (int i = 0; i < 2; i++) + ref Vector256 destYRef = ref Unsafe.As>(ref yBlock); + ref Vector256 destCbRef = ref Unsafe.As>(ref cbBlock); + ref Vector256 destCrRef = ref Unsafe.As>(ref crBlock); + + var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); + var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); + Vector256 rgb, rg, bx; + Vector256 r, g, b; + for (int i = 0; i < 7; i++) { - ref Vector256 destYRef = ref Unsafe.Add(ref Unsafe.As>(ref yBlock), i * 4); - ref Vector256 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), i * 4); - ref Vector256 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), i * 4); - - Vector256 red = Unsafe.Add(ref inRef, i * 3); - Vector256 green = Unsafe.Add(ref inRef, (i * 3) + 1); - Vector256 blue = Unsafe.Add(ref inRef, (i * 3) + 2); + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte(); - for (int j = 0; j < 2; j++) - { - // 1st part of unrolled loop - Vector256 mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), j * 2); + rgb = Avx2.Shuffle(rgb, extractRgbMask); - Vector256 r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32()); - Vector256 g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32()); - Vector256 b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32()); + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); - // (0.299F * r) + (0.587F * g) + (0.114F * b); - Vector256 yy0 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b))); + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); - // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) - Vector256 cb0 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b)))); + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref destYRef, i) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); - // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) - Vector256 cr0 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b)))); + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); - // 2nd part of unrolled loop - mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), (j * 2) + 1); - - r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32()); - g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32()); - b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32()); + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + } - // (0.299F * r) + (0.587F * g) + (0.114F * b); - Vector256 yy1 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b))); + extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes)); + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte(); + rgb = Avx2.Shuffle(rgb, extractRgbMask); - // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) - Vector256 cb1 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b)))); + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); - // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) - Vector256 cr1 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b)))); + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); - // store results from 1st and 2nd part - Vector256 tmpY = Avx.Permute2x128(yy0, yy1, 0b0010_0001); - Unsafe.Add(ref destYRef, j) = Avx.Blend(yy0, tmpY, 0b1111_0000); - Unsafe.Add(ref destYRef, j + 2) = Avx.Blend(yy1, tmpY, 0b0000_1111); + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); - Vector256 tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001); - Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000); - Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb1, tmpCb, 0b0000_1111); + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); - Vector256 tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001); - Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000); - Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr1, tmpCr, 0b0000_1111); - } - } - } + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); #endif + } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs index 776cbb44f3..9a6fc8d6fd 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -48,17 +48,13 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Rgb24[] data = CreateTestData(); - // RgbToYCbCrConverterVectorized uses `data` as working memory so we need a copy for verification below - Rgb24[] dataCopy = new Rgb24[data.Length]; - data.CopyTo(dataCopy, 0); - Block8x8F y = default; Block8x8F cb = default; Block8x8F cr = default; RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr); - Verify(dataCopy, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); + Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); } private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer) @@ -73,7 +69,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); - Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), comparer); + Assert.True(comparer.Equals(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y} == {yResult[i]}, {cb} == {cbResult[i]}, {cr} == {crResult[i]}"); } }