diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs similarity index 79% rename from src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs rename to src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs index 236eff27cc..3c1a02c5aa 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs @@ -1,16 +1,17 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System; using System.Runtime.CompilerServices; +using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { /// /// Provides 8-bit lookup tables for converting from Rgb to YCbCr colorspace. /// Methods to build the tables are based on libjpeg implementation. - /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)! /// - internal unsafe struct RgbToYCbCrTables + internal unsafe struct RgbToYCbCrConverterLut { /// /// The red luminance table @@ -63,10 +64,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// Initializes the YCbCr tables /// - /// The initialized - public static RgbToYCbCrTables Create() + /// The initialized + public static RgbToYCbCrConverterLut Create() { - RgbToYCbCrTables tables = default; + RgbToYCbCrConverterLut tables = default; for (int i = 0; i <= 255; i++) { @@ -92,11 +93,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } /// - /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)! /// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values. 
/// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void ConvertPixelInto( + private void ConvertPixelInto( int r, int g, int b, @@ -111,10 +111,29 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits; - // float cr = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero); + // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits; } + public void Convert(Span rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + { + ref Rgb24 rgbStart = ref rgbSpan[0]; + + for (int i = 0; i < 64; i++) + { + ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i); + + this.ConvertPixelInto( + c.R, + c.G, + c.B, + ref yBlock, + ref cbBlock, + ref crBlock, + i); + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Fix(float x) => (int)((x * (1L << ScaleBits)) + 0.5F); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs new file mode 100644 index 0000000000..209cc3c6ab --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -0,0 +1,120 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System; +using System.Diagnostics; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder +{ + internal static class RgbToYCbCrConverterVectorized + { + public static bool IsSupported + { + get + { +#if SUPPORTS_RUNTIME_INTRINSICS + return Avx2.IsSupported; +#else + return false; +#endif + } + } + +#if SUPPORTS_RUNTIME_INTRINSICS + private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[] + { + 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, + 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 + }; + + private static ReadOnlySpan<byte> MoveLast24BytesToSeparateLanes => new byte[] + { + 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0 + }; + + private static ReadOnlySpan<byte> ExtractRgb => new byte[] + { + 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF + }; +#endif + + public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + { + Debug.Assert(IsSupported && rgbSpan.Length >= 64, "AVX2 and a full 8x8 block (64 pixels) are required to run this converter"); + +#if SUPPORTS_RUNTIME_INTRINSICS + var f0299 = Vector256.Create(0.299f); + var f0587 = Vector256.Create(0.587f); + var f0114 = Vector256.Create(0.114f); + var fn0168736 = Vector256.Create(-0.168736f); + var fn0331264 = Vector256.Create(-0.331264f); + var f128 = Vector256.Create(128f); + var fn0418688 = Vector256.Create(-0.418688f); + var fn0081312F = Vector256.Create(-0.081312F); + var f05 = Vector256.Create(0.5f); + var zero = Vector256.Create(0).AsByte(); + + ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan)); + ref Vector256<float> destYRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock); + ref Vector256<float> destCbRef = ref
Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock); + ref Vector256<float> destCrRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock); + + var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); + var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb)); + Vector256<byte> rgb, rg, bx; + Vector256<float> r, g, b; + for (int i = 0; i < 7; i++) + { + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte(); + + rgb = Avx2.Shuffle(rgb, extractRgbMask); + + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); + + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); + + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref destYRef, i) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + } + + extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes)); + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte(); + rgb = Avx2.Shuffle(rgb, extractRgbMask); + + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); + + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b =
Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); + + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); +#endif + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs index 4d6186e22f..8fcc63c6aa 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs @@ -2,7 +2,6 @@ // Licensed under the Apache License, Version 2.0. 
using System; -using System.Runtime.CompilerServices; using SixLabors.ImageSharp.Advanced; using SixLabors.ImageSharp.PixelFormats; @@ -33,7 +32,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// The color conversion tables /// - private RgbToYCbCrTables colorTables; + private RgbToYCbCrConverterLut colorTables; /// /// Temporal 8x8 block to hold TPixel data @@ -48,7 +47,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public static YCbCrForwardConverter Create() { var result = default(YCbCrForwardConverter); - result.colorTables = RgbToYCbCrTables.Create(); + if (!RgbToYCbCrConverterVectorized.IsSupported) + { + // Avoid creating lookup tables, when vectorized converter is supported + result.colorTables = RgbToYCbCrConverterLut.Create(); + } + return result; } @@ -65,20 +69,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref Block8x8F yBlock = ref this.Y; ref Block8x8F cbBlock = ref this.Cb; ref Block8x8F crBlock = ref this.Cr; - ref Rgb24 rgbStart = ref rgbSpan[0]; - for (int i = 0; i < 64; i++) + if (RgbToYCbCrConverterVectorized.IsSupported) { - ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i); - - this.colorTables.ConvertPixelInto( - c.R, - c.G, - c.B, - ref yBlock, - ref cbBlock, - ref crBlock, - i); + RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + } + else + { + this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); } } } diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index d94aeffe69..979206ad5c 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -4,6 +4,11 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.InteropServices; +using 
System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Processing.Processors.Transforms { @@ -66,21 +71,95 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - ref float horizontalValues = ref Unsafe.AsRef<float>(this.bufferPtr); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported && Fma.IsSupported) + { + // Avx2.PermuteVar8x32 is used below, so AVX2 must be checked as well: FMA support does not imply AVX2 support. + float* bufferStart = this.bufferPtr; + float* bufferEnd = bufferStart + (this.Length & ~3); + Vector256<float> result256_0 = Vector256<float>.Zero; + Vector256<float> result256_1 = Vector256<float>.Zero; + ReadOnlySpan<byte> maskBytes = new byte[] + { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0, + }; + Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes)); - // Destination color components - Vector4 result = Vector4.Zero; + while (bufferStart < bufferEnd) + { + // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps + // for the FMA operation, and execute it directly on the target register and reading directly from + // memory for the first parameter. This skips initializing a SIMD register, and an extra copy. + // The code below should compile to the following assembly on .NET 5 x64: + // + // vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _] + // vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b] + // vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0 + // + // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212. + // Additionally, we're also unrolling two computations per loop iteration to leverage the + // fact that most CPUs have two ports to schedule multiply operations for FMA instructions. 
+ result256_0 = Fma.MultiplyAdd( + Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), + result256_0); - for (int i = 0; i < this.Length; i++) - { - float weight = Unsafe.Add(ref horizontalValues, i); + result256_1 = Fma.MultiplyAdd( + Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask), + result256_1); + + bufferStart += 4; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 4); + } + + result256_0 = Avx.Add(result256_0, result256_1); + + if ((this.Length & 3) >= 2) + { + result256_0 = Fma.MultiplyAdd( + Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef), + Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask), + result256_0); + + bufferStart += 2; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 2); + } - // Vector4 v = offsetedRowSpan[i]; - Vector4 v = Unsafe.Add(ref rowStartRef, i); - result += v * weight; + Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper()); + + if ((this.Length & 1) != 0) + { + result128 = Fma.MultiplyAdd( + Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef), + Vector128.Create(*bufferStart), + result128); + } + + return *(Vector4*)&result128; } + else +#endif + { + // Destination color components + Vector4 result = Vector4.Zero; + float* bufferStart = this.bufferPtr; + float* bufferEnd = this.bufferPtr + this.Length; - return result; + while (bufferStart < bufferEnd) + { + // Vector4 v = offsetedRowSpan[i]; + result += rowStartRef * *bufferStart; + + bufferStart++; + rowStartRef = ref Unsafe.Add(ref rowStartRef, 1); + } + + return result; + } } /// <summary> diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs new file mode 100644 index 0000000000..1db4072932 --- /dev/null +++ 
b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs @@ -0,0 +1,59 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Formats.Jpeg.Components; +using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder +{ + /// <summary> + /// Measures the lookup-table RGB to YCbCr conversion against the vectorized one on a single block of 64 seeded random pixels. + /// </summary> + public class YCbCrForwardConverterBenchmark + { + private RgbToYCbCrConverterLut converter; + private Rgb24[] data; + + [GlobalSetup] + public void Setup() + { + var rng = new Random(42); + var channels = new byte[3]; + + this.converter = RgbToYCbCrConverterLut.Create(); + this.data = new Rgb24[64]; + + for (int index = 0; index < this.data.Length; index++) + { + rng.NextBytes(channels); + this.data[index] = new Rgb24(channels[0], channels[1], channels[2]); + } + } + + [Benchmark(Baseline = true)] + public void ConvertLut() + { + Block8x8F yBlock = default; + Block8x8F cbBlock = default; + Block8x8F crBlock = default; + + this.converter.Convert(this.data.AsSpan(), ref yBlock, ref cbBlock, ref crBlock); + } + + [Benchmark] + public void ConvertVectorized() + { + Block8x8F yBlock = default; + Block8x8F cbBlock = default; + Block8x8F crBlock = default; + + if (RgbToYCbCrConverterVectorized.IsSupported) + { + RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref yBlock, ref cbBlock, ref crBlock); + } + } + } +} diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs new file mode 100644 index 0000000000..9a6fc8d6fd --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -0,0 +1,91 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System; +using SixLabors.ImageSharp.ColorSpaces; +using SixLabors.ImageSharp.Formats.Jpeg.Components; +using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder; +using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Tests.Colorspaces.Conversion; +using Xunit; +using Xunit.Abstractions; + +// ReSharper disable InconsistentNaming +namespace SixLabors.ImageSharp.Tests.Formats.Jpg +{ + public class RgbToYCbCrConverterTests + { + public RgbToYCbCrConverterTests(ITestOutputHelper output) + { + this.Output = output; + } + + private ITestOutputHelper Output { get; } + + [Fact] + public void TestLutConverter() + { + Rgb24[] data = CreateTestData(); + var target = RgbToYCbCrConverterLut.Create(); + + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + target.Convert(data.AsSpan(), ref y, ref cb, ref cr); + + Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F)); + } + + [Fact] + public void TestVectorizedConverter() + { + if (!RgbToYCbCrConverterVectorized.IsSupported) + { + this.Output.WriteLine("No AVX2 present, skipping test!"); + return; + } + + Rgb24[] data = CreateTestData(); + + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr); + + Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); + } + + private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer) + { + for (int i = 0; i < data.Length; i++) + { + int r = data[i].R; + int g = data[i].G; + int b = data[i].B; + + float y = (0.299F * r) + (0.587F * g) + (0.114F * b); + float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); + float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); + + Assert.True(comparer.Equals(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i])), 
$"Pos {i}, Expected {y} == {yResult[i]}, {cb} == {cbResult[i]}, {cr} == {crResult[i]}"); + } + } + + private static Rgb24[] CreateTestData() + { + var data = new Rgb24[64]; + var r = new Random(42); + + var random = new byte[3]; + for (int i = 0; i < data.Length; i++) + { + r.NextBytes(random); + data[i] = new Rgb24(random[0], random[1], random[2]); + } + + return data; + } + } +} diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs index f4a94782fd..58b7fd12e8 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs @@ -139,7 +139,7 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Transforms testOutputDetails: workingBufferLimitInRows, appendPixelTypeToFileName: false); image.CompareToReferenceOutput( - ImageComparer.TolerantPercentage(0.001f), + ImageComparer.TolerantPercentage(0.004f), provider, testOutputDetails: workingBufferLimitInRows, appendPixelTypeToFileName: false);