From efd4d22665239b098aa1ede45231b6ed59586b64 Mon Sep 17 00:00:00 2001
From: Nicolas Portmann
Date: Sun, 17 Jan 2021 13:54:36 +0100
Subject: [PATCH] Add initial vectorized implementation with benchmarks

---
 ...bCrTables.cs => RgbToYCbCrConverterLut.cs} |  38 +++-
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 182 ++++++++++++++++++
 .../Encoder/YCbCrForwardConverter{TPixel}.cs  |  28 ++-
 .../Encoder/YCbCrForwardConverterBenchmark.cs |  56 ++++++
 4 files changed, 281 insertions(+), 23 deletions(-)
 rename src/ImageSharp/Formats/Jpeg/Components/Encoder/{RgbToYCbCrTables.cs => RgbToYCbCrConverterLut.cs} (79%)
 create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
 create mode 100644 tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
similarity index 79%
rename from src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs
rename to src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 236eff27c..835a34f65 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -1,16 +1,17 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System;
 using System.Runtime.CompilerServices;
+using SixLabors.ImageSharp.PixelFormats;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
     /// <summary>
     /// Provides 8-bit lookup tables for converting from Rgb to YCbCr colorspace.
     /// Methods to build the tables are based on libjpeg implementation.
-    /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
     /// </summary>
-    internal unsafe struct RgbToYCbCrTables
+    internal unsafe struct RgbToYCbCrConverterLut
     {
         /// <summary>
         /// The red luminance table
@@ -63,10 +64,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Initializes the YCbCr tables
         /// </summary>
-        /// <returns>The initialized <see cref="RgbToYCbCrTables"/></returns>
-        public static RgbToYCbCrTables Create()
+        /// <returns>The initialized <see cref="RgbToYCbCrConverterLut"/></returns>
+        public static RgbToYCbCrConverterLut Create()
         {
-            RgbToYCbCrTables tables = default;
+            RgbToYCbCrConverterLut tables = default;
 
             for (int i = 0; i <= 255; i++)
             {
@@ -92,11 +93,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         /// <summary>
-        /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
         /// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public void ConvertPixelInto(
+        private void ConvertPixelInto(
             int r,
             int g,
             int b,
@@ -111,10 +111,32 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
             cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
 
-            // float cr = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero);
+            // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
             crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
         }
 
+        /// <summary>
+        /// Converts the first 64 pixels of <paramref name="rgbSpan"/> one by one via the lookup tables, writing pixel i to index i of the three blocks.
+        /// </summary>
+        public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        {
+            ref Rgb24 rgbStart = ref rgbSpan[0];
+
+            for (int i = 0; i < 64; i++)
+            {
+                ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
+
+                this.ConvertPixelInto(
+                    c.R,
+                    c.G,
+                    c.B,
+                    ref yBlock,
+                    ref cbBlock,
+                    ref crBlock,
+                    i);
+            }
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static int Fix(float x)
             => (int)((x * (1L << ScaleBits)) + 0.5F);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
new file mode 100644
index 000000000..068c3db96
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -0,0 +1,182 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+    internal static class RgbToYCbCrConverterVectorized
+    {
+        private static ReadOnlySpan<byte> ExtractionMasks => new byte[]
+        {
+            0x0, 0xFF, 0xFF, 0xFF, 0x1, 0xFF, 0xFF, 0xFF, 0x2, 0xFF, 0xFF, 0xFF, 0x3, 0xFF, 0xFF, 0xFF, 0x10, 0xFF, 0xFF, 0xFF, 0x11, 0xFF, 0xFF, 0xFF, 0x12, 0xFF, 0xFF, 0xFF, 0x13, 0xFF, 0xFF, 0xFF,
+            0x4, 0xFF, 0xFF, 0xFF, 0x5, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF, 0x14, 0xFF, 0xFF, 0xFF, 0x15, 0xFF, 0xFF, 0xFF, 0x16, 0xFF, 0xFF, 0xFF, 0x17, 0xFF, 0xFF, 0xFF,
+            0x8, 0xFF, 0xFF, 0xFF, 0x9, 0xFF, 0xFF, 0xFF, 0xA, 0xFF, 0xFF, 0xFF, 0xB, 0xFF, 0xFF, 0xFF, 0x18, 0xFF, 0xFF, 0xFF, 0x19, 0xFF, 0xFF, 0xFF, 0x1A, 0xFF, 0xFF, 0xFF, 0x1B, 0xFF, 0xFF, 0xFF,
+            0xC, 0xFF, 0xFF, 0xFF, 0xD, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF, 0x1C, 0xFF, 0xFF, 0xFF, 0x1D, 0xFF, 0xFF, 0xFF, 0x1E, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF,
+        };
+
+        public static bool IsSupported
+        {
+            get
+            {
+#if SUPPORTS_RUNTIME_INTRINSICS
+                return Avx2.IsSupported && Fma.IsSupported;
+#else
+                return false;
+#endif
+            }
+        }
+
+        public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        {
+            Debug.Assert(IsSupported, "AVX2 and FMA are required to run this converter");
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+            SeparateRgb(rgbSpan);
+            ConvertInternal(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+#endif
+        }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
+        /// Rearranges the provided <paramref name="rgbSpan"/> in-place
+        /// from { r00, g00, b00, ..., r63, g63, b63 }
+        /// to { r00, ... r31, g00, ..., g31, b00, ..., b31,
+        ///      r32, ... r63, g32, ..., g63, b32, ..., b63 }
+        /// </summary>
+        /// <remarks>
+        /// SSE is used for this operation as it is significantly faster than AVX in this specific case.
+        /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers.
+        /// </remarks>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static void SeparateRgb(ReadOnlySpan<Rgb24> rgbSpan)
+        {
+            var selectRed0 = Vector128.Create(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectRed1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectRed2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D);
+
+            var selectGreen0 = Vector128.Create(0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectGreen1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectGreen2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E);
+
+            var selectBlue0 = Vector128.Create(0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectBlue1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectBlue2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F);
+
+            for (int i = 0; i < 2; i++)
+            {
+                ref Vector128<byte> inRef = ref Unsafe.Add(ref Unsafe.As<Rgb24, Vector128<byte>>(ref MemoryMarshal.GetReference(rgbSpan)), i * 6);
+
+                Vector128<byte> in0 = inRef;
+                Vector128<byte> in1 = Unsafe.Add(ref inRef, 1);
+                Vector128<byte> in2 = Unsafe.Add(ref inRef, 2);
+
+                Vector128<byte> r0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2));
+                Vector128<byte> g0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2));
+                Vector128<byte> b0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2));
+
+                in0 = Unsafe.Add(ref inRef, 3);
+                in1 = Unsafe.Add(ref inRef, 4);
+                in2 = Unsafe.Add(ref inRef, 5);
+
+                Vector128<byte> r1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2));
+                Vector128<byte> g1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2));
+                Vector128<byte> b1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2));
+
+                inRef = r0;
+                Unsafe.Add(ref inRef, 1) = r1;
+                Unsafe.Add(ref inRef, 2) = g0;
+                Unsafe.Add(ref inRef, 3) = g1;
+                Unsafe.Add(ref inRef, 4) = b0;
+                Unsafe.Add(ref inRef, 5) = b1;
+            }
+        }
+
+        /// <summary>
+        /// Converts the previously separated (see <see cref="SeparateRgb"/>) RGB values to YCbCr using AVX2 and FMA.
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static void ConvertInternal(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        {
+            var f0299 = Vector256.Create(0.299f);
+            var f0587 = Vector256.Create(0.587f);
+            var f0114 = Vector256.Create(0.114f);
+            var fn0168736 = Vector256.Create(-0.168736f);
+            var fn0331264 = Vector256.Create(-0.331264f);
+            var f128 = Vector256.Create(128f);
+            var fn0418688 = Vector256.Create(-0.418688f);
+            var fn0081312F = Vector256.Create(-0.081312F);
+            var f05 = Vector256.Create(0.5f);
+
+            ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
+
+            for (int i = 0; i < 2; i++)
+            {
+                ref Vector256<float> destYRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock), i * 4);
+                ref Vector256<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock), i * 4);
+                ref Vector256<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock), i * 4);
+
+                Vector256<byte> red = Unsafe.Add(ref inRef, i * 3);
+                Vector256<byte> green = Unsafe.Add(ref inRef, (i * 3) + 1);
+                Vector256<byte> blue = Unsafe.Add(ref inRef, (i * 3) + 2);
+
+                for (int j = 0; j < 2; j++)
+                {
+                    // 1st part of unrolled loop (extraction mask row j * 2)
+                    Vector256<byte> mask = Unsafe.Add(ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractionMasks)), j * 2);
+
+                    Vector256<float> r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32());
+                    Vector256<float> g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32());
+                    Vector256<float> b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32());
+
+                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
+                    Vector256<float> yy0 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b)));
+
+                    // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+                    Vector256<float> cb0 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b))));
+
+                    // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+                    Vector256<float> cr0 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b))));
+
+                    // 2nd part of unrolled loop (extraction mask row (j * 2) + 1)
+                    mask = Unsafe.Add(ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractionMasks)), (j * 2) + 1);
+
+                    r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32());
+                    g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32());
+                    b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32());
+
+                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
+                    Vector256<float> yy1 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b)));
+
+                    // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+                    Vector256<float> cb1 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b))));
+
+                    // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+                    Vector256<float> cr1 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b))));
+
+                    // store results from 1st and 2nd part: tmp = { high half of part 1, low half of part 2 }, blended so each store holds 8 consecutive pixels
+                    Vector256<float> tmpY = Avx.Permute2x128(yy0, yy1, 0b0010_0001);
+                    Unsafe.Add(ref destYRef, j) = Avx.Blend(yy0, tmpY, 0b1111_0000);
+                    Unsafe.Add(ref destYRef, j + 2) = Avx.Blend(yy1, tmpY, 0b0000_1111);
+
+                    Vector256<float> tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001);
+                    Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000);
+                    Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb1, tmpCb, 0b0000_1111);
+
+                    Vector256<float> tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001);
+                    Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000);
+                    Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr1, tmpCr, 0b0000_1111);
+                }
+            }
+        }
+#endif
+    }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index 4d6186e22..b65899327 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -2,7 +2,6 @@
 // Licensed under the Apache
License, Version 2.0.
 
 using System;
-using System.Runtime.CompilerServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.PixelFormats;
 
@@ -33,7 +32,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// The color conversion tables
         /// </summary>
-        private RgbToYCbCrTables colorTables;
+        private RgbToYCbCrConverterLut colorTables;
 
         /// <summary>
         /// Temporal 8x8 block to hold TPixel data
@@ -48,7 +47,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public static YCbCrForwardConverter<TPixel> Create()
         {
             var result = default(YCbCrForwardConverter<TPixel>);
-            result.colorTables = RgbToYCbCrTables.Create();
+            if (!RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                // Only the fallback path reads the lookup tables; skip creating them when the vectorized converter is supported
+                result.colorTables = RgbToYCbCrConverterLut.Create();
+            }
+
             return result;
         }
 
@@ -65,20 +69,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             ref Block8x8F yBlock = ref this.Y;
             ref Block8x8F cbBlock = ref this.Cb;
             ref Block8x8F crBlock = ref this.Cr;
-            ref Rgb24 rgbStart = ref rgbSpan[0];
 
-            for (int i = 0; i < 64; i++)
+            if (RgbToYCbCrConverterVectorized.IsSupported)
             {
-                ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
-
-                this.colorTables.ConvertPixelInto(
-                    c.R,
-                    c.G,
-                    c.B,
-                    ref yBlock,
-                    ref cbBlock,
-                    ref crBlock,
-                    i);
+                RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+            }
+            else
+            {
+                this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
             }
         }
     }
diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
new file mode 100644
index 000000000..1db407293
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
@@ -0,0 +1,56 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Formats.Jpeg.Components;
+using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder
+{
+    public class YCbCrForwardConverterBenchmark
+    {
+        private RgbToYCbCrConverterLut converter;
+        private Rgb24[] data;
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            this.data = new Rgb24[64];
+            var random = new Random(42);
+            var channels = new byte[3];
+
+            for (int index = 0; index < this.data.Length; index++)
+            {
+                random.NextBytes(channels);
+                this.data[index] = new Rgb24(channels[0], channels[1], channels[2]);
+            }
+
+            this.converter = RgbToYCbCrConverterLut.Create();
+        }
+
+        [Benchmark(Baseline = true)]
+        public void ConvertLut()
+        {
+            Block8x8F luminance = default;
+            Block8x8F chromaBlue = default;
+            Block8x8F chromaRed = default;
+
+            this.converter.Convert(this.data.AsSpan(), ref luminance, ref chromaBlue, ref chromaRed);
+        }
+
+        [Benchmark]
+        public void ConvertVectorized()
+        {
+            // NOTE: the vectorized converter rearranges the pixels of this.data in place on every call
+            Block8x8F luminance = default;
+            Block8x8F chromaBlue = default;
+            Block8x8F chromaRed = default;
+            if (RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref luminance, ref chromaBlue, ref chromaRed);
+            }
+        }
+    }
+}