From efd4d22665239b098aa1ede45231b6ed59586b64 Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Sun, 17 Jan 2021 13:54:36 +0100 Subject: [PATCH 1/5] Add initial vectorized implementation with benchmarks --- ...bCrTables.cs => RgbToYCbCrConverterLut.cs} | 35 +++- .../Encoder/RgbToYCbCrConverterVectorized.cs | 182 ++++++++++++++++++ .../Encoder/YCbCrForwardConverter{TPixel}.cs | 28 ++- .../Encoder/YCbCrForwardConverterBenchmark.cs | 56 ++++++ 4 files changed, 278 insertions(+), 23 deletions(-) rename src/ImageSharp/Formats/Jpeg/Components/Encoder/{RgbToYCbCrTables.cs => RgbToYCbCrConverterLut.cs} (79%) create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs create mode 100644 tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs similarity index 79% rename from src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs rename to src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs index 236eff27cc..835a34f652 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs @@ -1,16 +1,17 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System; using System.Runtime.CompilerServices; +using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { /// /// Provides 8-bit lookup tables for converting from Rgb to YCbCr colorspace. /// Methods to build the tables are based on libjpeg implementation. - /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)! /// - internal unsafe struct RgbToYCbCrTables + internal unsafe struct RgbToYCbCrConverterLut { /// /// The red luminance table @@ -63,10 +64,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// Initializes the YCbCr tables /// - /// The initialized - public static RgbToYCbCrTables Create() + /// The initialized + public static RgbToYCbCrConverterLut Create() { - RgbToYCbCrTables tables = default; + RgbToYCbCrConverterLut tables = default; for (int i = 0; i <= 255; i++) { @@ -92,11 +93,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } /// - /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)! /// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void ConvertPixelInto( + private void ConvertPixelInto( int r, int g, int b, @@ -111,10 +111,29 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits; - // float cr = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero); + // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits; } + public void Convert(Span rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + { + ref Rgb24 rgbStart = ref rgbSpan[0]; + + for (int i = 0; i < 64; i++) + { + ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i); + + this.ConvertPixelInto( + c.R, + c.G, + c.B, + ref yBlock, + ref cbBlock, + ref crBlock, + i); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Fix(float x) => (int)((x * (1L << ScaleBits)) + 0.5F); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs new file mode 100644 index 0000000000..068c3db964 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -0,0 +1,182 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Diagnostics; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder +{ + internal static class RgbToYCbCrConverterVectorized + { + private static ReadOnlySpan ExtractionMasks => new byte[] + { + 0x0, 0xFF, 0xFF, 0xFF, 0x1, 0xFF, 0xFF, 0xFF, 0x2, 0xFF, 0xFF, 0xFF, 0x3, 0xFF, 0xFF, 0xFF, 0x10, 0xFF, 0xFF, 0xFF, 0x11, 0xFF, 0xFF, 0xFF, 0x12, 0xFF, 0xFF, 0xFF, 0x13, 0xFF, 0xFF, 0xFF, + 0x4, 0xFF, 0xFF, 0xFF, 0x5, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF, 0x14, 0xFF, 0xFF, 0xFF, 0x15, 0xFF, 0xFF, 0xFF, 0x16, 0xFF, 0xFF, 0xFF, 0x17, 0xFF, 0xFF, 0xFF, + 0x8, 0xFF, 0xFF, 0xFF, 0x9, 0xFF, 0xFF, 0xFF, 0xA, 0xFF, 0xFF, 0xFF, 0xB, 0xFF, 0xFF, 0xFF, 0x18, 0xFF, 0xFF, 0xFF, 0x19, 0xFF, 0xFF, 0xFF, 0x1A, 0xFF, 0xFF, 0xFF, 0x1B, 0xFF, 0xFF, 0xFF, + 0xC, 0xFF, 0xFF, 0xFF, 0xD, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF, 0x1C, 0xFF, 0xFF, 0xFF, 0x1D, 0xFF, 0xFF, 0xFF, 0x1E, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF, + }; + + public static bool IsSupported + { + get + { +#if SUPPORTS_RUNTIME_INTRINSICS + return Avx2.IsSupported && Fma.IsSupported; +#else + return false; +#endif + } + } + + public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + { + Debug.Assert(IsSupported, "AVX2 and FMA are required to run this converter"); + +#if SUPPORTS_RUNTIME_INTRINSICS + SeparateRgb(rgbSpan); + ConvertInternal(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); +#endif + } + +#if SUPPORTS_RUNTIME_INTRINSICS + /// + /// Rearranges the provided in-place + /// from { r00, g00, b00, ..., r63, g63, b63 } + /// to { r00, ... r31, g00, ..., g31, b00, ..., b31, + /// r32, ... r63, g32, ..., g63, b31, ..., b63 } + /// + /// + /// SSE is used for this operation as it is significantly faster than AVX in this specific case. + /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers. + /// + [MethodImpl(InliningOptions.ShortMethod)] + private static void SeparateRgb(ReadOnlySpan rgbSpan) + { + var selectRed0 = Vector128.Create(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectRed1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectRed2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D); + + var selectGreen0 = Vector128.Create(0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectGreen1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectGreen2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E); + + var selectBlue0 = Vector128.Create(0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectBlue1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectBlue2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F); + + for (int i = 0; i < 2; i++) + { + ref Vector128 inRef = ref Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)), i * 6); + + Vector128 in0 = inRef; + Vector128 in1 = Unsafe.Add(ref inRef, 1); + Vector128 in2 = Unsafe.Add(ref inRef, 2); + + Vector128 r0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); + Vector128 g0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2)); + Vector128 b0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2)); + + in0 = Unsafe.Add(ref inRef, 3); + in1 = Unsafe.Add(ref inRef, 4); + in2 = Unsafe.Add(ref inRef, 5); + + Vector128 r1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); + Vector128 g1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2)); + Vector128 b1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2)); + + inRef = r0; + Unsafe.Add(ref inRef, 1) = r1; + Unsafe.Add(ref inRef, 2) = g0; + Unsafe.Add(ref inRef, 3) = g1; + Unsafe.Add(ref inRef, 4) = b0; + Unsafe.Add(ref inRef, 5) = b1; + } + } + + /// + /// Converts the previously separated (see ) RGB values to YCbCr using AVX2 and FMA. + /// + [MethodImpl(InliningOptions.ShortMethod)] + private static void ConvertInternal(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + { + var f0299 = Vector256.Create(0.299f); + var f0587 = Vector256.Create(0.587f); + var f0114 = Vector256.Create(0.114f); + var fn0168736 = Vector256.Create(-0.168736f); + var fn0331264 = Vector256.Create(-0.331264f); + var f128 = Vector256.Create(128f); + var fn0418688 = Vector256.Create(-0.418688f); + var fn0081312F = Vector256.Create(-0.081312F); + var f05 = Vector256.Create(0.5f); + + ref Vector256 inRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); + + for (int i = 0; i < 2; i++) + { + ref Vector256 destYRef = ref Unsafe.Add(ref Unsafe.As>(ref yBlock), i * 4); + ref Vector256 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), i * 4); + ref Vector256 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), i * 4); + + Vector256 red = Unsafe.Add(ref inRef, i * 3); + Vector256 green = Unsafe.Add(ref inRef, (i * 3) + 1); + Vector256 blue = Unsafe.Add(ref inRef, (i * 3) + 2); + + for (int j = 0; j < 2; j++) + { + // 1st part of unrolled loop + Vector256 mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), j * 2); + + Vector256 r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32()); + Vector256 g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32()); + Vector256 b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32()); + + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Vector256 yy0 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b))); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Vector256 cb0 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b)))); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Vector256 cr0 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b)))); + + // 2nd part of unrolled loop + mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), (j * 2) + 1); + + r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32()); + + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Vector256 yy1 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b))); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Vector256 cb1 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b)))); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Vector256 cr1 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b)))); + + // store results from 1st and 2nd part + Vector256 tmpY = Avx.Permute2x128(yy0, yy1, 0b0010_0001); + Unsafe.Add(ref destYRef, j) = Avx.Blend(yy0, tmpY, 0b1111_0000); + Unsafe.Add(ref destYRef, j + 2) = Avx.Blend(yy1, tmpY, 0b0000_1111); + + Vector256 tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001); + Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000); + Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb0, tmpCb, 0b0000_1111); + + Vector256 tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001); + Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000); + Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr0, tmpCr, 0b0000_1111); + } + } + } +#endif + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs index 4d6186e22f..b658993278 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs @@ -2,7 +2,6 @@ // Licensed under the Apache License, Version 2.0. using System; -using System.Runtime.CompilerServices; using SixLabors.ImageSharp.Advanced; using SixLabors.ImageSharp.PixelFormats; @@ -33,7 +32,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// The color conversion tables /// - private RgbToYCbCrTables colorTables; + private RgbToYCbCrConverterLut colorTables; /// /// Temporal 8x8 block to hold TPixel data @@ -48,7 +47,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public static YCbCrForwardConverter Create() { var result = default(YCbCrForwardConverter); - result.colorTables = RgbToYCbCrTables.Create(); + if (RgbToYCbCrConverterVectorized.IsSupported) + { + // Avoid creating lookup tables, when vectorized converter is supported + result.colorTables = RgbToYCbCrConverterLut.Create(); + } + return result; } @@ -65,20 +69,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref Block8x8F yBlock = ref this.Y; ref Block8x8F cbBlock = ref this.Cb; ref Block8x8F crBlock = ref this.Cr; - ref Rgb24 rgbStart = ref rgbSpan[0]; - for (int i = 0; i < 64; i++) + if (RgbToYCbCrConverterVectorized.IsSupported) { - ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i); - - this.colorTables.ConvertPixelInto( - c.R, - c.G, - c.B, - ref yBlock, - ref cbBlock, - ref crBlock, - i); + RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + } + else + { + this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); } } } diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs new file mode 100644 index 0000000000..1db4072932 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs @@ -0,0 +1,56 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Formats.Jpeg.Components; +using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder +{ + public class YCbCrForwardConverterBenchmark + { + private RgbToYCbCrConverterLut converter; + private Rgb24[] data; + + [GlobalSetup] + public void Setup() + { + this.converter = RgbToYCbCrConverterLut.Create(); + + var r = new Random(42); + this.data = new Rgb24[64]; + + var d = new byte[3]; + for (int i = 0; i < this.data.Length; i++) + { + r.NextBytes(d); + this.data[i] = new Rgb24(d[0], d[1], d[2]); + } + } + + [Benchmark(Baseline = true)] + public void ConvertLut() + { + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + this.converter.Convert(this.data.AsSpan(), ref y, ref cb, ref cr); + } + + [Benchmark] + public void ConvertVectorized() + { + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + if (RgbToYCbCrConverterVectorized.IsSupported) + { + RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref y, ref cb, ref cr); + } + } + } +} From 429696bd5e0ae1a5a872d8711c305228799726f9 Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Sun, 17 Jan 2021 14:55:21 +0100 Subject: [PATCH 2/5] Fix mistakes in final touches --- .../Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs | 4 ++-- .../Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs index 068c3db964..ddaa2069ed 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -169,11 +169,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder Vector256 tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001); Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000); - Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb0, tmpCb, 0b0000_1111); + Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb1, tmpCb, 0b0000_1111); Vector256 tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001); Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000); - Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr0, tmpCr, 0b0000_1111); + Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr1, tmpCr, 0b0000_1111); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs index b658993278..8fcc63c6aa 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs @@ -47,7 +47,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public static YCbCrForwardConverter Create() { var result = default(YCbCrForwardConverter); - if (RgbToYCbCrConverterVectorized.IsSupported) + if (!RgbToYCbCrConverterVectorized.IsSupported) { // Avoid creating lookup tables, when vectorized converter is supported result.colorTables = RgbToYCbCrConverterLut.Create(); From 93099d1585e14706f85ea58682d799d4b446b8e4 Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Sun, 17 Jan 2021 22:50:32 +0100 Subject: [PATCH 3/5] Add unit tests for both converters --- .../Encoder/RgbToYCbCrConverterLut.cs | 2 +- .../Formats/Jpg/RgbToYCbCrConverterTests.cs | 98 +++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs index 835a34f652..3c1a02c5aa 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs @@ -111,7 +111,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits; - // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits; } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs new file mode 100644 index 0000000000..9134de42e5 --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -0,0 +1,98 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using SixLabors.ImageSharp.ColorSpaces; +using SixLabors.ImageSharp.Formats.Jpeg.Components; +using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder; +using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Tests.Colorspaces.Conversion; +using Xunit; +using Xunit.Abstractions; + +// ReSharper disable InconsistentNaming +namespace SixLabors.ImageSharp.Tests.Formats.Jpg +{ + public class RgbToYCbCrConverterTests + { + private const float Epsilon = .5F; + private static readonly ApproximateColorSpaceComparer Comparer = new ApproximateColorSpaceComparer(Epsilon); + + public RgbToYCbCrConverterTests(ITestOutputHelper output) + { + this.Output = output; + } + + private ITestOutputHelper Output { get; } + + [Fact] + public void TestLutConverter() + { + Rgb24[] data = CreateTestData(); + var target = RgbToYCbCrConverterLut.Create(); + + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + target.Convert(data.AsSpan(), ref y, ref cb, ref cr); + + Verify(data, ref y, ref cb, ref cr); + } + + [Fact] + public void TestVectorizedConverter() + { + if (!RgbToYCbCrConverterVectorized.IsSupported) + { + this.Output.WriteLine("No AVX and/or FMA present, skipping test!"); + return; + } + + Rgb24[] data = CreateTestData(); + + // RgbToYCbCrConverterVectorized uses `data` as working memory so we need a copy for verification below + Rgb24[] dataCopy = new Rgb24[data.Length]; + data.CopyTo(dataCopy, 0); + + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr); + + Verify(dataCopy, ref y, ref cb, ref cr); + } + + private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult) + { + for (int i = 0; i < data.Length; i++) + { + int r = data[i].R; + int g = data[i].G; + int b = data[i].B; + + float y = (0.299F * r) + (0.587F * g) + (0.114F * b); + float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); + float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); + + Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), Comparer); + } + } + + private static Rgb24[] CreateTestData() + { + var data = new Rgb24[64]; + var r = new Random(); + + var random = new byte[3]; + for (int i = 0; i < data.Length; i++) + { + r.NextBytes(random); + data[i] = new Rgb24(random[0], random[1], random[2]); + } + + return data; + } + } +} From 08a68af1a997c56a4e6a721cf9de7fdb1cd1f4ce Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Sun, 17 Jan 2021 23:01:18 +0100 Subject: [PATCH 4/5] Allow epsilon of 1F for existing LUT converter --- .../Formats/Jpg/RgbToYCbCrConverterTests.cs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs index 9134de42e5..776cbb44f3 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -15,9 +15,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { public class RgbToYCbCrConverterTests { - private const float Epsilon = .5F; - private static readonly ApproximateColorSpaceComparer Comparer = new ApproximateColorSpaceComparer(Epsilon); - public RgbToYCbCrConverterTests(ITestOutputHelper output) { this.Output = output; @@ -37,7 +34,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg target.Convert(data.AsSpan(), ref y, ref cb, ref cr); - Verify(data, ref y, ref cb, ref cr); + Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F)); } [Fact] @@ -61,10 +58,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr); - Verify(dataCopy, ref y, ref cb, ref cr); + Verify(dataCopy, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); } - private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult) + private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer) { for (int i = 0; i < data.Length; i++) { @@ -76,7 +73,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); - Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), Comparer); + Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), comparer); } } From 5033e3eb950aa15a89a1ccd1f706c629344f9119 Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Mon, 18 Jan 2021 12:49:08 +0100 Subject: [PATCH 5/5] Improve algorithm --- .../Encoder/RgbToYCbCrConverterVectorized.cs | 180 ++++++------------ .../Formats/Jpg/RgbToYCbCrConverterTests.cs | 8 +- 2 files changed, 61 insertions(+), 127 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs index ddaa2069ed..209cc3c6ab 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -15,97 +15,43 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { internal static class RgbToYCbCrConverterVectorized { - private static ReadOnlySpan ExtractionMasks => new byte[] - { - 0x0, 0xFF, 0xFF, 0xFF, 0x1, 0xFF, 0xFF, 0xFF, 0x2, 0xFF, 0xFF, 0xFF, 0x3, 0xFF, 0xFF, 0xFF, 0x10, 0xFF, 0xFF, 0xFF, 0x11, 0xFF, 0xFF, 0xFF, 0x12, 0xFF, 0xFF, 0xFF, 0x13, 0xFF, 0xFF, 0xFF, - 0x4, 0xFF, 0xFF, 0xFF, 0x5, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF, 0x14, 0xFF, 0xFF, 0xFF, 0x15, 0xFF, 0xFF, 0xFF, 0x16, 0xFF, 0xFF, 0xFF, 0x17, 0xFF, 0xFF, 0xFF, - 0x8, 0xFF, 0xFF, 0xFF, 0x9, 0xFF, 0xFF, 0xFF, 0xA, 0xFF, 0xFF, 0xFF, 0xB, 0xFF, 0xFF, 0xFF, 0x18, 0xFF, 0xFF, 0xFF, 0x19, 0xFF, 0xFF, 0xFF, 0x1A, 0xFF, 0xFF, 0xFF, 0x1B, 0xFF, 0xFF, 0xFF, - 0xC, 0xFF, 0xFF, 0xFF, 0xD, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF, 0x1C, 0xFF, 0xFF, 0xFF, 0x1D, 0xFF, 0xFF, 0xFF, 0x1E, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF, - }; - public static bool IsSupported { get { #if SUPPORTS_RUNTIME_INTRINSICS - return Avx2.IsSupported && Fma.IsSupported; + return Avx2.IsSupported; #else return false; #endif } } - public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) - { - Debug.Assert(IsSupported, "AVX2 and FMA are required to run this converter"); - #if SUPPORTS_RUNTIME_INTRINSICS - SeparateRgb(rgbSpan); - ConvertInternal(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); -#endif - } - -#if SUPPORTS_RUNTIME_INTRINSICS - /// - /// Rearranges the provided in-place - /// from { r00, g00, b00, ..., r63, g63, b63 } - /// to { r00, ... r31, g00, ..., g31, b00, ..., b31, - /// r32, ... r63, g32, ..., g63, b31, ..., b63 } - /// - /// - /// SSE is used for this operation as it is significantly faster than AVX in this specific case. - /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers. - /// - [MethodImpl(InliningOptions.ShortMethod)] - private static void SeparateRgb(ReadOnlySpan rgbSpan) + private static ReadOnlySpan MoveFirst24BytesToSeparateLanes => new byte[] { - var selectRed0 = Vector128.Create(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectRed1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectRed2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D); - - var selectGreen0 = Vector128.Create(0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectGreen1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectGreen2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E); + 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, + 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 + }; - var selectBlue0 = Vector128.Create(0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectBlue1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectBlue2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F); + private static ReadOnlySpan MoveLast24BytesToSeparateLanes => new byte[] + { + 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0 + }; - for (int i = 0; i < 2; i++) - { - ref Vector128 inRef = ref Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)), i * 6); - - Vector128 in0 = inRef; - Vector128 in1 = Unsafe.Add(ref inRef, 1); - Vector128 in2 = Unsafe.Add(ref inRef, 2); - - Vector128 r0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); - Vector128 g0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2)); - Vector128 b0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2)); - - in0 = Unsafe.Add(ref inRef, 3); - in1 = Unsafe.Add(ref inRef, 4); - in2 = Unsafe.Add(ref inRef, 5); - - Vector128 r1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); - Vector128 g1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2)); - Vector128 b1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2)); - - inRef = r0; - Unsafe.Add(ref inRef, 1) = r1; - Unsafe.Add(ref inRef, 2) = g0; - Unsafe.Add(ref inRef, 3) = g1; - Unsafe.Add(ref inRef, 4) = b0; - Unsafe.Add(ref inRef, 5) = b1; - } - } + private static ReadOnlySpan ExtractRgb => new byte[] + { + 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF + }; +#endif - /// - /// Converts the previously separated (see ) RGB values to YCbCr using AVX2 and FMA. - /// - [MethodImpl(InliningOptions.ShortMethod)] - private static void ConvertInternal(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) { + Debug.Assert(IsSupported, "AVX2 is required to run this converter"); + +#if SUPPORTS_RUNTIME_INTRINSICS var f0299 = Vector256.Create(0.299f); var f0587 = Vector256.Create(0.587f); var f0114 = Vector256.Create(0.114f); @@ -115,68 +61,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder var fn0418688 = Vector256.Create(-0.418688f); var fn0081312F = Vector256.Create(-0.081312F); var f05 = Vector256.Create(0.5f); + var zero = Vector256.Create(0).AsByte(); ref Vector256 inRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); - - for (int i = 0; i < 2; i++) + ref Vector256 destYRef = ref Unsafe.As>(ref yBlock); + ref Vector256 destCbRef = ref Unsafe.As>(ref cbBlock); + ref Vector256 destCrRef = ref Unsafe.As>(ref crBlock); + + var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); + var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); + Vector256 rgb, rg, bx; + Vector256 r, g, b; + for (int i = 0; i < 7; i++) { - ref Vector256 destYRef = ref Unsafe.Add(ref Unsafe.As>(ref yBlock), i * 4); - ref Vector256 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), i * 4); - ref Vector256 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), i * 4); - - Vector256 red = Unsafe.Add(ref inRef, i * 3); - Vector256 green = Unsafe.Add(ref inRef, (i * 3) + 1); - Vector256 blue = Unsafe.Add(ref inRef, (i * 3) + 2); + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte(); - for (int j = 0; j < 2; j++) - { - // 1st part of unrolled loop - Vector256 mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), j * 2); + rgb = Avx2.Shuffle(rgb, extractRgbMask); - Vector256 r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32()); - Vector256 g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32()); - Vector256 b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32()); + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); - // (0.299F * r) + (0.587F * g) + (0.114F * b); - Vector256 yy0 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b))); + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); - // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) - Vector256 cb0 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b)))); + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref destYRef, i) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); - // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) - Vector256 cr0 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b)))); + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); - // 2nd part of unrolled loop - mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), (j * 2) + 1); - - r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32()); - g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32()); - b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32()); + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + } - // (0.299F * r) + (0.587F * g) + (0.114F * b); - Vector256 yy1 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b))); + extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes)); + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte(); + rgb = Avx2.Shuffle(rgb, extractRgbMask); - // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) - Vector256 cb1 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b)))); + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); - // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) - Vector256 cr1 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b)))); + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); - // store results from 1st and 2nd part - Vector256 tmpY = Avx.Permute2x128(yy0, yy1, 0b0010_0001); - Unsafe.Add(ref destYRef, j) = Avx.Blend(yy0, tmpY, 0b1111_0000); - Unsafe.Add(ref destYRef, j + 2) = Avx.Blend(yy1, tmpY, 0b0000_1111); + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); - Vector256 tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001); - Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000); - Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb1, tmpCb, 0b0000_1111); + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); - Vector256 tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001); - Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000); - Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr1, tmpCr, 0b0000_1111); - } - } - } + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); #endif + } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs index 776cbb44f3..9a6fc8d6fd 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -48,17 +48,13 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Rgb24[] data = CreateTestData(); - // RgbToYCbCrConverterVectorized uses `data` as working memory so we need a copy for verification below - Rgb24[] dataCopy = new Rgb24[data.Length]; - data.CopyTo(dataCopy, 0); - Block8x8F y = default; Block8x8F cb = default; Block8x8F cr = default; RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr); - Verify(dataCopy, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); + Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); } private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer) @@ -73,7 +69,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); - Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), comparer); + Assert.True(comparer.Equals(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y} == {yResult[i]}, {cb} == {cbResult[i]}, {cr} == {crResult[i]}"); } }