From efd4d22665239b098aa1ede45231b6ed59586b64 Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Sun, 17 Jan 2021 13:54:36 +0100 Subject: [PATCH 01/13] Add initial vectorized implementation with benchmarks --- ...bCrTables.cs => RgbToYCbCrConverterLut.cs} | 35 +++- .../Encoder/RgbToYCbCrConverterVectorized.cs | 182 ++++++++++++++++++ .../Encoder/YCbCrForwardConverter{TPixel}.cs | 28 ++- .../Encoder/YCbCrForwardConverterBenchmark.cs | 56 ++++++ 4 files changed, 278 insertions(+), 23 deletions(-) rename src/ImageSharp/Formats/Jpeg/Components/Encoder/{RgbToYCbCrTables.cs => RgbToYCbCrConverterLut.cs} (79%) create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs create mode 100644 tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs similarity index 79% rename from src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs rename to src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs index 236eff27cc..835a34f652 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs @@ -1,16 +1,17 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System; using System.Runtime.CompilerServices; +using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { /// /// Provides 8-bit lookup tables for converting from Rgb to YCbCr colorspace. /// Methods to build the tables are based on libjpeg implementation. - /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)! 
/// - internal unsafe struct RgbToYCbCrTables + internal unsafe struct RgbToYCbCrConverterLut { /// /// The red luminance table @@ -63,10 +64,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// Initializes the YCbCr tables /// - /// The initialized - public static RgbToYCbCrTables Create() + /// The initialized + public static RgbToYCbCrConverterLut Create() { - RgbToYCbCrTables tables = default; + RgbToYCbCrConverterLut tables = default; for (int i = 0; i <= 255; i++) { @@ -92,11 +93,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } /// - /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)! /// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void ConvertPixelInto( + private void ConvertPixelInto( int r, int g, int b, @@ -111,10 +111,29 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits; - // float cr = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero); + // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits; } + public void Convert(Span rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + { + ref Rgb24 rgbStart = ref rgbSpan[0]; + + for (int i = 0; i < 64; i++) + { + ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i); + + this.ConvertPixelInto( + c.R, + c.G, + c.B, + ref yBlock, + ref cbBlock, + ref crBlock, + i); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Fix(float x) => (int)((x * (1L << ScaleBits)) + 0.5F); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs 
b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs new file mode 100644 index 0000000000..068c3db964 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -0,0 +1,182 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Diagnostics; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder +{ + internal static class RgbToYCbCrConverterVectorized + { + private static ReadOnlySpan ExtractionMasks => new byte[] + { + 0x0, 0xFF, 0xFF, 0xFF, 0x1, 0xFF, 0xFF, 0xFF, 0x2, 0xFF, 0xFF, 0xFF, 0x3, 0xFF, 0xFF, 0xFF, 0x10, 0xFF, 0xFF, 0xFF, 0x11, 0xFF, 0xFF, 0xFF, 0x12, 0xFF, 0xFF, 0xFF, 0x13, 0xFF, 0xFF, 0xFF, + 0x4, 0xFF, 0xFF, 0xFF, 0x5, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF, 0x14, 0xFF, 0xFF, 0xFF, 0x15, 0xFF, 0xFF, 0xFF, 0x16, 0xFF, 0xFF, 0xFF, 0x17, 0xFF, 0xFF, 0xFF, + 0x8, 0xFF, 0xFF, 0xFF, 0x9, 0xFF, 0xFF, 0xFF, 0xA, 0xFF, 0xFF, 0xFF, 0xB, 0xFF, 0xFF, 0xFF, 0x18, 0xFF, 0xFF, 0xFF, 0x19, 0xFF, 0xFF, 0xFF, 0x1A, 0xFF, 0xFF, 0xFF, 0x1B, 0xFF, 0xFF, 0xFF, + 0xC, 0xFF, 0xFF, 0xFF, 0xD, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF, 0x1C, 0xFF, 0xFF, 0xFF, 0x1D, 0xFF, 0xFF, 0xFF, 0x1E, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF, + }; + + public static bool IsSupported + { + get + { +#if SUPPORTS_RUNTIME_INTRINSICS + return Avx2.IsSupported && Fma.IsSupported; +#else + return false; +#endif + } + } + + public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + { + Debug.Assert(IsSupported, "AVX2 and FMA are required to run this converter"); + +#if SUPPORTS_RUNTIME_INTRINSICS + SeparateRgb(rgbSpan); + 
ConvertInternal(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); +#endif + } + +#if SUPPORTS_RUNTIME_INTRINSICS + /// + /// Rearranges the provided in-place + /// from { r00, g00, b00, ..., r63, g63, b63 } + /// to { r00, ... r31, g00, ..., g31, b00, ..., b31, + /// r32, ... r63, g32, ..., g63, b31, ..., b63 } + /// + /// + /// SSE is used for this operation as it is significantly faster than AVX in this specific case. + /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers. + /// + [MethodImpl(InliningOptions.ShortMethod)] + private static void SeparateRgb(ReadOnlySpan rgbSpan) + { + var selectRed0 = Vector128.Create(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectRed1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectRed2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D); + + var selectGreen0 = Vector128.Create(0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectGreen1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectGreen2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E); + + var selectBlue0 = Vector128.Create(0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectBlue1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + var selectBlue2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F); + + for (int i = 0; i < 2; i++) + { + ref Vector128 inRef = ref Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)), i * 6); + + 
Vector128 in0 = inRef; + Vector128 in1 = Unsafe.Add(ref inRef, 1); + Vector128 in2 = Unsafe.Add(ref inRef, 2); + + Vector128 r0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); + Vector128 g0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2)); + Vector128 b0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2)); + + in0 = Unsafe.Add(ref inRef, 3); + in1 = Unsafe.Add(ref inRef, 4); + in2 = Unsafe.Add(ref inRef, 5); + + Vector128 r1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); + Vector128 g1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2)); + Vector128 b1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2)); + + inRef = r0; + Unsafe.Add(ref inRef, 1) = r1; + Unsafe.Add(ref inRef, 2) = g0; + Unsafe.Add(ref inRef, 3) = g1; + Unsafe.Add(ref inRef, 4) = b0; + Unsafe.Add(ref inRef, 5) = b1; + } + } + + /// + /// Converts the previously separated (see ) RGB values to YCbCr using AVX2 and FMA. 
+ /// + [MethodImpl(InliningOptions.ShortMethod)] + private static void ConvertInternal(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + { + var f0299 = Vector256.Create(0.299f); + var f0587 = Vector256.Create(0.587f); + var f0114 = Vector256.Create(0.114f); + var fn0168736 = Vector256.Create(-0.168736f); + var fn0331264 = Vector256.Create(-0.331264f); + var f128 = Vector256.Create(128f); + var fn0418688 = Vector256.Create(-0.418688f); + var fn0081312F = Vector256.Create(-0.081312F); + var f05 = Vector256.Create(0.5f); + + ref Vector256 inRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); + + for (int i = 0; i < 2; i++) + { + ref Vector256 destYRef = ref Unsafe.Add(ref Unsafe.As>(ref yBlock), i * 4); + ref Vector256 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), i * 4); + ref Vector256 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), i * 4); + + Vector256 red = Unsafe.Add(ref inRef, i * 3); + Vector256 green = Unsafe.Add(ref inRef, (i * 3) + 1); + Vector256 blue = Unsafe.Add(ref inRef, (i * 3) + 2); + + for (int j = 0; j < 2; j++) + { + // 1st part of unrolled loop + Vector256 mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), j * 2); + + Vector256 r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32()); + Vector256 g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32()); + Vector256 b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32()); + + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Vector256 yy0 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b))); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Vector256 cb0 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b)))); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Vector256 cr0 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, 
Avx.Multiply(fn0081312F, b)))); + + // 2nd part of unrolled loop + mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), (j * 2) + 1); + + r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32()); + + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Vector256 yy1 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b))); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Vector256 cb1 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b)))); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Vector256 cr1 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b)))); + + // store results from 1st and 2nd part + Vector256 tmpY = Avx.Permute2x128(yy0, yy1, 0b0010_0001); + Unsafe.Add(ref destYRef, j) = Avx.Blend(yy0, tmpY, 0b1111_0000); + Unsafe.Add(ref destYRef, j + 2) = Avx.Blend(yy1, tmpY, 0b0000_1111); + + Vector256 tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001); + Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000); + Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb0, tmpCb, 0b0000_1111); + + Vector256 tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001); + Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000); + Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr0, tmpCr, 0b0000_1111); + } + } + } +#endif + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs index 4d6186e22f..b658993278 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs @@ -2,7 +2,6 @@ // Licensed under the Apache 
License, Version 2.0. using System; -using System.Runtime.CompilerServices; using SixLabors.ImageSharp.Advanced; using SixLabors.ImageSharp.PixelFormats; @@ -33,7 +32,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// The color conversion tables /// - private RgbToYCbCrTables colorTables; + private RgbToYCbCrConverterLut colorTables; /// /// Temporal 8x8 block to hold TPixel data @@ -48,7 +47,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public static YCbCrForwardConverter Create() { var result = default(YCbCrForwardConverter); - result.colorTables = RgbToYCbCrTables.Create(); + if (RgbToYCbCrConverterVectorized.IsSupported) + { + // Avoid creating lookup tables, when vectorized converter is supported + result.colorTables = RgbToYCbCrConverterLut.Create(); + } + return result; } @@ -65,20 +69,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref Block8x8F yBlock = ref this.Y; ref Block8x8F cbBlock = ref this.Cb; ref Block8x8F crBlock = ref this.Cr; - ref Rgb24 rgbStart = ref rgbSpan[0]; - for (int i = 0; i < 64; i++) + if (RgbToYCbCrConverterVectorized.IsSupported) { - ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i); - - this.colorTables.ConvertPixelInto( - c.R, - c.G, - c.B, - ref yBlock, - ref cbBlock, - ref crBlock, - i); + RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + } + else + { + this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); } } } diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs new file mode 100644 index 0000000000..1db4072932 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs @@ -0,0 +1,56 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System; +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Formats.Jpeg.Components; +using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder +{ + public class YCbCrForwardConverterBenchmark + { + private RgbToYCbCrConverterLut converter; + private Rgb24[] data; + + [GlobalSetup] + public void Setup() + { + this.converter = RgbToYCbCrConverterLut.Create(); + + var r = new Random(42); + this.data = new Rgb24[64]; + + var d = new byte[3]; + for (int i = 0; i < this.data.Length; i++) + { + r.NextBytes(d); + this.data[i] = new Rgb24(d[0], d[1], d[2]); + } + } + + [Benchmark(Baseline = true)] + public void ConvertLut() + { + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + this.converter.Convert(this.data.AsSpan(), ref y, ref cb, ref cr); + } + + [Benchmark] + public void ConvertVectorized() + { + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + if (RgbToYCbCrConverterVectorized.IsSupported) + { + RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref y, ref cb, ref cr); + } + } + } +} From 429696bd5e0ae1a5a872d8711c305228799726f9 Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Sun, 17 Jan 2021 14:55:21 +0100 Subject: [PATCH 02/13] Fix mistakes in final touches --- .../Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs | 4 ++-- .../Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs index 068c3db964..ddaa2069ed 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -169,11 +169,11 @@ namespace 
SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder Vector256 tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001); Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000); - Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb0, tmpCb, 0b0000_1111); + Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb1, tmpCb, 0b0000_1111); Vector256 tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001); Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000); - Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr0, tmpCr, 0b0000_1111); + Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr1, tmpCr, 0b0000_1111); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs index b658993278..8fcc63c6aa 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs @@ -47,7 +47,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public static YCbCrForwardConverter Create() { var result = default(YCbCrForwardConverter); - if (RgbToYCbCrConverterVectorized.IsSupported) + if (!RgbToYCbCrConverterVectorized.IsSupported) { // Avoid creating lookup tables, when vectorized converter is supported result.colorTables = RgbToYCbCrConverterLut.Create(); From 93099d1585e14706f85ea58682d799d4b446b8e4 Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Sun, 17 Jan 2021 22:50:32 +0100 Subject: [PATCH 03/13] Add unit tests for both converters --- .../Encoder/RgbToYCbCrConverterLut.cs | 2 +- .../Formats/Jpg/RgbToYCbCrConverterTests.cs | 98 +++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs index 
835a34f652..3c1a02c5aa 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs @@ -111,7 +111,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits; - // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits; } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs new file mode 100644 index 0000000000..9134de42e5 --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -0,0 +1,98 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System; +using SixLabors.ImageSharp.ColorSpaces; +using SixLabors.ImageSharp.Formats.Jpeg.Components; +using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder; +using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Tests.Colorspaces.Conversion; +using Xunit; +using Xunit.Abstractions; + +// ReSharper disable InconsistentNaming +namespace SixLabors.ImageSharp.Tests.Formats.Jpg +{ + public class RgbToYCbCrConverterTests + { + private const float Epsilon = .5F; + private static readonly ApproximateColorSpaceComparer Comparer = new ApproximateColorSpaceComparer(Epsilon); + + public RgbToYCbCrConverterTests(ITestOutputHelper output) + { + this.Output = output; + } + + private ITestOutputHelper Output { get; } + + [Fact] + public void TestLutConverter() + { + Rgb24[] data = CreateTestData(); + var target = RgbToYCbCrConverterLut.Create(); + + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + target.Convert(data.AsSpan(), ref y, ref cb, ref cr); + + Verify(data, ref y, ref cb, ref cr); + } + + [Fact] + public void TestVectorizedConverter() + { + if (!RgbToYCbCrConverterVectorized.IsSupported) + { + this.Output.WriteLine("No AVX and/or FMA present, skipping test!"); + return; + } + + Rgb24[] data = CreateTestData(); + + // RgbToYCbCrConverterVectorized uses `data` as working memory so we need a copy for verification below + Rgb24[] dataCopy = new Rgb24[data.Length]; + data.CopyTo(dataCopy, 0); + + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr); + + Verify(dataCopy, ref y, ref cb, ref cr); + } + + private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult) + { + for (int i = 0; i < data.Length; i++) + { + int r = data[i].R; + int g = data[i].G; + int b = data[i].B; + + float y = (0.299F * r) + (0.587F * g) + (0.114F * b); + float cb = 128F + ((-0.168736F * 
r) - (0.331264F * g) + (0.5F * b)); + float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); + + Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), Comparer); + } + } + + private static Rgb24[] CreateTestData() + { + var data = new Rgb24[64]; + var r = new Random(); + + var random = new byte[3]; + for (int i = 0; i < data.Length; i++) + { + r.NextBytes(random); + data[i] = new Rgb24(random[0], random[1], random[2]); + } + + return data; + } + } +} From 08a68af1a997c56a4e6a721cf9de7fdb1cd1f4ce Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Sun, 17 Jan 2021 23:01:18 +0100 Subject: [PATCH 04/13] Allow epsilon of 1F for existing LUT converter --- .../Formats/Jpg/RgbToYCbCrConverterTests.cs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs index 9134de42e5..776cbb44f3 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -15,9 +15,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { public class RgbToYCbCrConverterTests { - private const float Epsilon = .5F; - private static readonly ApproximateColorSpaceComparer Comparer = new ApproximateColorSpaceComparer(Epsilon); - public RgbToYCbCrConverterTests(ITestOutputHelper output) { this.Output = output; @@ -37,7 +34,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg target.Convert(data.AsSpan(), ref y, ref cb, ref cr); - Verify(data, ref y, ref cb, ref cr); + Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F)); } [Fact] @@ -61,10 +58,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr); - Verify(dataCopy, ref y, ref cb, ref cr); + Verify(dataCopy, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); } - 
private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult) + private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer) { for (int i = 0; i < data.Length; i++) { @@ -76,7 +73,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); - Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), Comparer); + Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), comparer); } } From 5033e3eb950aa15a89a1ccd1f706c629344f9119 Mon Sep 17 00:00:00 2001 From: Nicolas Portmann Date: Mon, 18 Jan 2021 12:49:08 +0100 Subject: [PATCH 05/13] Improve algorithm --- .../Encoder/RgbToYCbCrConverterVectorized.cs | 180 ++++++------------ .../Formats/Jpg/RgbToYCbCrConverterTests.cs | 8 +- 2 files changed, 61 insertions(+), 127 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs index ddaa2069ed..209cc3c6ab 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -15,97 +15,43 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { internal static class RgbToYCbCrConverterVectorized { - private static ReadOnlySpan ExtractionMasks => new byte[] - { - 0x0, 0xFF, 0xFF, 0xFF, 0x1, 0xFF, 0xFF, 0xFF, 0x2, 0xFF, 0xFF, 0xFF, 0x3, 0xFF, 0xFF, 0xFF, 0x10, 0xFF, 0xFF, 0xFF, 0x11, 0xFF, 0xFF, 0xFF, 0x12, 0xFF, 0xFF, 0xFF, 0x13, 0xFF, 0xFF, 0xFF, - 0x4, 0xFF, 0xFF, 0xFF, 0x5, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF, 0x14, 0xFF, 0xFF, 0xFF, 0x15, 0xFF, 0xFF, 0xFF, 0x16, 0xFF, 
0xFF, 0xFF, 0x17, 0xFF, 0xFF, 0xFF, - 0x8, 0xFF, 0xFF, 0xFF, 0x9, 0xFF, 0xFF, 0xFF, 0xA, 0xFF, 0xFF, 0xFF, 0xB, 0xFF, 0xFF, 0xFF, 0x18, 0xFF, 0xFF, 0xFF, 0x19, 0xFF, 0xFF, 0xFF, 0x1A, 0xFF, 0xFF, 0xFF, 0x1B, 0xFF, 0xFF, 0xFF, - 0xC, 0xFF, 0xFF, 0xFF, 0xD, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF, 0x1C, 0xFF, 0xFF, 0xFF, 0x1D, 0xFF, 0xFF, 0xFF, 0x1E, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF, - }; - public static bool IsSupported { get { #if SUPPORTS_RUNTIME_INTRINSICS - return Avx2.IsSupported && Fma.IsSupported; + return Avx2.IsSupported; #else return false; #endif } } - public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) - { - Debug.Assert(IsSupported, "AVX2 and FMA are required to run this converter"); - #if SUPPORTS_RUNTIME_INTRINSICS - SeparateRgb(rgbSpan); - ConvertInternal(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); -#endif - } - -#if SUPPORTS_RUNTIME_INTRINSICS - /// - /// Rearranges the provided in-place - /// from { r00, g00, b00, ..., r63, g63, b63 } - /// to { r00, ... r31, g00, ..., g31, b00, ..., b31, - /// r32, ... r63, g32, ..., g63, b31, ..., b63 } - /// - /// - /// SSE is used for this operation as it is significantly faster than AVX in this specific case. - /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers. 
- /// - [MethodImpl(InliningOptions.ShortMethod)] - private static void SeparateRgb(ReadOnlySpan rgbSpan) + private static ReadOnlySpan MoveFirst24BytesToSeparateLanes => new byte[] { - var selectRed0 = Vector128.Create(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectRed1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectRed2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D); - - var selectGreen0 = Vector128.Create(0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectGreen1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectGreen2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E); + 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, + 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 + }; - var selectBlue0 = Vector128.Create(0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectBlue1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - var selectBlue2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F); + private static ReadOnlySpan MoveLast24BytesToSeparateLanes => new byte[] + { + 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0 + }; - for (int i = 0; i < 2; i++) - { - ref Vector128 inRef = ref Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)), i * 6); - - Vector128 in0 = inRef; - Vector128 in1 = Unsafe.Add(ref inRef, 1); - Vector128 in2 = Unsafe.Add(ref inRef, 2); - - Vector128 r0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), 
Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); - Vector128 g0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2)); - Vector128 b0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2)); - - in0 = Unsafe.Add(ref inRef, 3); - in1 = Unsafe.Add(ref inRef, 4); - in2 = Unsafe.Add(ref inRef, 5); - - Vector128 r1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); - Vector128 g1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2)); - Vector128 b1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2)); - - inRef = r0; - Unsafe.Add(ref inRef, 1) = r1; - Unsafe.Add(ref inRef, 2) = g0; - Unsafe.Add(ref inRef, 3) = g1; - Unsafe.Add(ref inRef, 4) = b0; - Unsafe.Add(ref inRef, 5) = b1; - } - } + private static ReadOnlySpan ExtractRgb => new byte[] + { + 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF + }; +#endif - /// - /// Converts the previously separated (see ) RGB values to YCbCr using AVX2 and FMA. 
- /// - [MethodImpl(InliningOptions.ShortMethod)] - private static void ConvertInternal(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) { + Debug.Assert(IsSupported, "AVX2 is required to run this converter"); + +#if SUPPORTS_RUNTIME_INTRINSICS var f0299 = Vector256.Create(0.299f); var f0587 = Vector256.Create(0.587f); var f0114 = Vector256.Create(0.114f); @@ -115,68 +61,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder var fn0418688 = Vector256.Create(-0.418688f); var fn0081312F = Vector256.Create(-0.081312F); var f05 = Vector256.Create(0.5f); + var zero = Vector256.Create(0).AsByte(); ref Vector256 inRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); - - for (int i = 0; i < 2; i++) + ref Vector256 destYRef = ref Unsafe.As>(ref yBlock); + ref Vector256 destCbRef = ref Unsafe.As>(ref cbBlock); + ref Vector256 destCrRef = ref Unsafe.As>(ref crBlock); + + var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); + var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); + Vector256 rgb, rg, bx; + Vector256 r, g, b; + for (int i = 0; i < 7; i++) { - ref Vector256 destYRef = ref Unsafe.Add(ref Unsafe.As>(ref yBlock), i * 4); - ref Vector256 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), i * 4); - ref Vector256 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), i * 4); - - Vector256 red = Unsafe.Add(ref inRef, i * 3); - Vector256 green = Unsafe.Add(ref inRef, (i * 3) + 1); - Vector256 blue = Unsafe.Add(ref inRef, (i * 3) + 2); + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte(); - for (int j = 0; j < 2; j++) - { - // 1st part of unrolled loop - Vector256 mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), j * 
2); + rgb = Avx2.Shuffle(rgb, extractRgbMask); - Vector256 r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32()); - Vector256 g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32()); - Vector256 b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32()); + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); - // (0.299F * r) + (0.587F * g) + (0.114F * b); - Vector256 yy0 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b))); + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); - // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) - Vector256 cb0 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b)))); + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref destYRef, i) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); - // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) - Vector256 cr0 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b)))); + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); - // 2nd part of unrolled loop - mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), (j * 2) + 1); - - r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32()); - g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32()); - b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32()); + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, 
SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + } - // (0.299F * r) + (0.587F * g) + (0.114F * b); - Vector256 yy1 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b))); + extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes)); + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte(); + rgb = Avx2.Shuffle(rgb, extractRgbMask); - // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) - Vector256 cb1 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b)))); + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); - // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) - Vector256 cr1 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b)))); + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); - // store results from 1st and 2nd part - Vector256 tmpY = Avx.Permute2x128(yy0, yy1, 0b0010_0001); - Unsafe.Add(ref destYRef, j) = Avx.Blend(yy0, tmpY, 0b1111_0000); - Unsafe.Add(ref destYRef, j + 2) = Avx.Blend(yy1, tmpY, 0b0000_1111); + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); - Vector256 tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001); - Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000); - Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb1, tmpCb, 0b0000_1111); + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, 
SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); - Vector256 tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001); - Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000); - Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr1, tmpCr, 0b0000_1111); - } - } - } + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); #endif + } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs index 776cbb44f3..9a6fc8d6fd 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -48,17 +48,13 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Rgb24[] data = CreateTestData(); - // RgbToYCbCrConverterVectorized uses `data` as working memory so we need a copy for verification below - Rgb24[] dataCopy = new Rgb24[data.Length]; - data.CopyTo(dataCopy, 0); - Block8x8F y = default; Block8x8F cb = default; Block8x8F cr = default; RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr); - Verify(dataCopy, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); + Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); } private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer) @@ -73,7 +69,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); - Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), comparer); + Assert.True(comparer.Equals(new 
YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y} == {yResult[i]}, {cb} == {cbResult[i]}, {cr} == {crResult[i]}"); } } From 1033297a37519b56729b7a5ba54259ba1fcb4de4 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 18:19:31 +0100 Subject: [PATCH 06/13] Add initial FMA resize kernel convolve implementation --- .../Transforms/Resize/ResizeKernel.cs | 58 +++++++++++++++---- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index d94aeffe69..bff2c574a6 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -4,6 +4,10 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Processing.Processors.Transforms { @@ -66,21 +70,55 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - ref float horizontalValues = ref Unsafe.AsRef(this.bufferPtr); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Fma.IsSupported) + { + float* bufferStart = this.bufferPtr; + float* bufferEnd = bufferStart + (this.Length & ~1); + Vector256 result256 = Vector256.Zero; - // Destination color components - Vector4 result = Vector4.Zero; + while (bufferStart < bufferEnd) + { + Vector256 rowItem256 = Unsafe.As>(ref rowStartRef); + var bufferItem256 = Vector256.Create(Vector128.Create(bufferStart[0]), Vector128.Create(bufferStart[1])); - for (int i = 0; i < this.Length; i++) - { - float weight = Unsafe.Add(ref horizontalValues, i); + result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256); + + bufferStart += 2; + 
rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); + } + + Vector128 result128 = Sse.Add(result256.GetLower(), result256.GetUpper()); + + if ((this.Length & 1) != 0) + { + Vector128 rowItem128 = Unsafe.As>(ref rowStartRef); + var bufferItem128 = Vector128.Create(*bufferStart); - // Vector4 v = offsetedRowSpan[i]; - Vector4 v = Unsafe.Add(ref rowStartRef, i); - result += v * weight; + result128 = Fma.MultiplyAdd(rowItem128, bufferItem128, result128); + } + + return *(Vector4*)&result128; } + else +#endif + { + // Destination color components + Vector4 result = Vector4.Zero; + float* bufferStart = this.bufferPtr; + float* bufferEnd = this.bufferPtr + this.Length; + + while (bufferStart < bufferEnd) + { + // Vector4 v = offsetedRowSpan[i]; + result += rowStartRef * *bufferStart; - return result; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 1); + bufferStart++; + } + + return result; + } } /// From c825eccd10f14eb733cdbe4c75656005afae5aed Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 19:14:51 +0100 Subject: [PATCH 07/13] Improved loading of factors using permutation Assembly for loading in the loop went from: ```asm vmovss xmm2, [rax] vbroadcastss xmm2, xmm2 vmovss xmm3, [rax+4] vbroadcastss xmm3, xmm3 vinsertf128 ymm2, ymm2, xmm3, 1 ``` To: ```asm vmovsd xmm3, [rax] vbroadcastsd ymm3, xmm3 vpermps ymm3, ymm1, ymm3 ``` --- .../Processing/Processors/Transforms/Resize/ResizeKernel.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index bff2c574a6..02027f42d8 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -76,11 +76,12 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms float* bufferStart = this.bufferPtr; float* bufferEnd = bufferStart + (this.Length & ~1); 
Vector256 result256 = Vector256.Zero; + var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1); while (bufferStart < bufferEnd) { Vector256 rowItem256 = Unsafe.As>(ref rowStartRef); - var bufferItem256 = Vector256.Create(Vector128.Create(bufferStart[0]), Vector128.Create(bufferStart[1])); + Vector256 bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask); result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256); From 1169e73915d98590e82d64f72fa3c2197e00aea9 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 19:33:15 +0100 Subject: [PATCH 08/13] Switch from FMA to AVX2 instructions --- .../Processors/Transforms/Resize/ResizeKernel.cs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 02027f42d8..5a87d045ea 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms public Vector4 ConvolveCore(ref Vector4 rowStartRef) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Fma.IsSupported) + if (Avx2.IsSupported) { float* bufferStart = this.bufferPtr; float* bufferEnd = bufferStart + (this.Length & ~1); @@ -82,8 +82,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms { Vector256 rowItem256 = Unsafe.As>(ref rowStartRef); Vector256 bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask); + Vector256 multiply256 = Avx.Multiply(rowItem256, bufferItem256); - result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256); + result256 = Avx.Add(multiply256, result256); bufferStart += 2; rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); @@ -95,8 +96,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms { 
Vector128 rowItem128 = Unsafe.As>(ref rowStartRef); var bufferItem128 = Vector128.Create(*bufferStart); + Vector128 multiply128 = Sse.Multiply(rowItem128, bufferItem128); - result128 = Fma.MultiplyAdd(rowItem128, bufferItem128, result128); + result128 = Sse.Add(multiply128, result128); } return *(Vector4*)&result128; @@ -114,8 +116,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms // Vector4 v = offsetedRowSpan[i]; result += rowStartRef * *bufferStart; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 1); bufferStart++; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 1); } return result; From 0e465cd8c30713b1c3c91966ebef855d4eda314d Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 22:58:43 +0100 Subject: [PATCH 09/13] Revert to FMA, codegen improvements --- .../Transforms/Resize/ResizeKernel.cs | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 5a87d045ea..bd22864bb2 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms public Vector4 ConvolveCore(ref Vector4 rowStartRef) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) + if (Fma.IsSupported) { float* bufferStart = this.bufferPtr; float* bufferEnd = bufferStart + (this.Length & ~1); @@ -80,11 +80,20 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms while (bufferStart < bufferEnd) { - Vector256 rowItem256 = Unsafe.As>(ref rowStartRef); - Vector256 bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask); - Vector256 multiply256 = Avx.Multiply(rowItem256, bufferItem256); - - result256 = Avx.Add(multiply256, result256); + // It is important to use a 
single expression here so that the JIT will correctly use vfmadd231ps + // for the FMA operation, and execute it directly on the target register while reading directly from + // memory for the first parameter. This skips initializing a SIMD register, and an extra copy. + // The code below should compile to the following assembly on .NET 5 x64: + // + // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _] + // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b] + // vfmadd231ps ymm0, ymm2, [r8] ; result256 = FMA(pixels, factors) + result256 + // + // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212. + result256 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), + result256); bufferStart += 2; rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); @@ -94,11 +103,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms if ((this.Length & 1) != 0) { - Vector128 rowItem128 = Unsafe.As>(ref rowStartRef); - var bufferItem128 = Vector128.Create(*bufferStart); - Vector128 multiply128 = Sse.Multiply(rowItem128, bufferItem128); - - result128 = Sse.Add(multiply128, result128); + result128 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Vector128.Create(*bufferStart), + result128); } return *(Vector4*)&result128; From e0b2defde22343414ee70babe21d1209fb760cbe Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 23:25:47 +0100 Subject: [PATCH 10/13] Add unrolled FMA loop --- .../Transforms/Resize/ResizeKernel.cs | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index bd22864bb2..b537cdfdf9 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ 
b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -74,8 +74,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms if (Fma.IsSupported) { float* bufferStart = this.bufferPtr; - float* bufferEnd = bufferStart + (this.Length & ~1); - Vector256 result256 = Vector256.Zero; + float* bufferEnd = bufferStart + (this.Length & ~3); + Vector256 result256_0 = Vector256.Zero; + Vector256 result256_1 = Vector256.Zero; var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1); while (bufferStart < bufferEnd) @@ -87,19 +88,36 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms // // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _] // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b] - // vfmadd231ps ymm0, ymm2, [r8] ; result256 = FMA(pixels, factors) + result256 + // vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0 // // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212. - result256 = Fma.MultiplyAdd( + // Additionally, we're also unrolling two computations per loop iteration to leverage the + // fact that most CPUs have two ports to schedule multiply operations for FMA instructions. 
+ result256_0 = Fma.MultiplyAdd( Unsafe.As>(ref rowStartRef), Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), - result256); + result256_0); - bufferStart += 2; - rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); + result256_1 = Fma.MultiplyAdd( + Unsafe.As>(ref Unsafe.Add(ref rowStartRef, 2)), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask), + result256_1); + + bufferStart += 4; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 4); + } + + result256_0 = Avx.Add(result256_0, result256_1); + + if ((this.Length & 3) >= 2) + { + result256_0 = Fma.MultiplyAdd( + Unsafe.As>(ref rowStartRef), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), + result256_0); } - Vector128 result128 = Sse.Add(result256.GetLower(), result256.GetUpper()); + Vector128 result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper()); if ((this.Length & 1) != 0) { From e68a21de52d1de7c9eaeb234ab50ec4cf470c2fc Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 19 Jan 2021 23:31:32 +0100 Subject: [PATCH 11/13] Add missing indexing update --- .../Processing/Processors/Transforms/Resize/ResizeKernel.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index b537cdfdf9..c79f938d73 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -115,6 +115,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms Unsafe.As>(ref rowStartRef), Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), result256_0); + + bufferStart += 2; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); } Vector128 result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper()); From 
ed4cfaa0ae4165357db4778da198189b8bc7d003 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Wed, 20 Jan 2021 17:39:12 +0100 Subject: [PATCH 12/13] Workaround for incorrect codegen on .NET 5 See Vector256.Create issue: https://github.com/dotnet/runtime/issues/47236 --- .../Processors/Transforms/Resize/ResizeKernel.cs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index c79f938d73..979206ad5c 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -5,6 +5,7 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; #if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; #endif @@ -77,7 +78,14 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms float* bufferEnd = bufferStart + (this.Length & ~3); Vector256 result256_0 = Vector256.Zero; Vector256 result256_1 = Vector256.Zero; - var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1); + ReadOnlySpan maskBytes = new byte[] + { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, + }; + Vector256 mask = Unsafe.ReadUnaligned>(ref MemoryMarshal.GetReference(maskBytes)); while (bufferStart < bufferEnd) { From 8c7019e41e9a9dfbba63af19859194471a08be3a Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Wed, 20 Jan 2021 21:50:35 +0100 Subject: [PATCH 13/13] Update image threshold for resize tests --- .../Processing/Processors/Transforms/ResizeTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs index f4a94782fd..58b7fd12e8 
100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs @@ -139,7 +139,7 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Transforms testOutputDetails: workingBufferLimitInRows, appendPixelTypeToFileName: false); image.CompareToReferenceOutput( - ImageComparer.TolerantPercentage(0.001f), + ImageComparer.TolerantPercentage(0.004f), provider, testOutputDetails: workingBufferLimitInRows, appendPixelTypeToFileName: false);