// Reconstructed post-patch contents of tests/ImageSharp.Benchmarks/Color/RgbToYCbCr.cs
// Origin: commit 65fb6395be1ae7479733ccdd10eae708a9230c0f, Anton Firszov, Thu, 12 Jan 2017 05:01:11 +0100
// Subject: refactored RgbaToYcbCr benchmarks to operate on arrays + added Vector based experiments
namespace ImageSharp.Benchmarks
{
    using System;
    using System.Buffers;
    using System.Numerics;
    using System.Runtime.CompilerServices;

    using BenchmarkDotNet.Attributes;

    using ImageSharp.Formats.Jpg;

    /// <summary>
    /// Benchmarks several strategies for converting a packed stream of RGB bytes
    /// into three planes (Y, Cb, Cr) of floats.
    /// </summary>
    /// <remarks>
    /// The class is partial: <c>LookupTables</c>, used by the LUT benchmark, is not defined in this file
    /// (presumably it lives in another part of this class - confirm).
    /// </remarks>
    public partial class RgbToYCbCr
    {
        /// <summary>
        /// Number of RGB triplets converted by every benchmark invocation.
        /// </summary>
        private const int InputColorCount = 64;

        /// <summary>
        /// Number of input bytes: three bytes (R, G, B) per color.
        /// </summary>
        private const int InputByteCount = InputColorCount * 3;

        /// <summary>
        /// The <see cref="Vector{T}"/> width (in ints) that the lane layout of <see cref="ScaledCoeffs"/> is written for.
        /// </summary>
        private const int RequiredVectorWidth = 8;

        // Per-channel weights. The signs of the Cb/Cr terms are applied when the products are summed.
        private static readonly Vector3 VectorY = new Vector3(0.299F, 0.587F, 0.114F);

        private static readonly Vector3 VectorCb = new Vector3(-0.168736F, 0.331264F, 0.5F);

        private static readonly Vector3 VectorCr = new Vector3(0.5F, 0.418688F, 0.081312F);

        /// <summary>
        /// Integer coefficients scaled by 1024, laid out for an 8-lane <see cref="Vector{T}"/> of ints
        /// that holds two PACKED RGB triplets: lanes 0..2 are the first pixel, lanes 3..5 the second,
        /// lanes 6..7 are unused.
        /// </summary>
        /// <remarks>
        /// The layout must match the packed (3 ints per pixel) input that the Vector benchmarks load at
        /// offset <c>i * 3</c>. A 4-lane stride here would pair the second pixel's coefficients with the
        /// wrong channels.
        /// </remarks>
        private static class ScaledCoeffs
        {
            public static readonly int[] Y =
            {
                306, 601, 117,
                306, 601, 117,
                0, 0,
            };

            public static readonly int[] Cb =
            {
                -172, 339, 512,
                -172, 339, 512,
                0, 0,
            };

            public static readonly int[] Cr =
            {
                512, 429, 83,
                512, 429, 83,
                0, 0,
            };

            /// <summary>
            /// Signed selection masks: a dot product with these sums the first pixel's lanes (0..2)
            /// with the signs required by each component.
            /// </summary>
            public static class SelectLeft
            {
                public static readonly int[] Y =
                {
                    1, 1, 1,
                    0, 0, 0,
                    0, 0,
                };

                public static readonly int[] Cb =
                {
                    1, -1, 1,
                    0, 0, 0,
                    0, 0,
                };

                public static readonly int[] Cr =
                {
                    1, -1, -1,
                    0, 0, 0,
                    0, 0,
                };
            }

            /// <summary>
            /// Signed selection masks: a dot product with these sums the second pixel's lanes (3..5)
            /// with the signs required by each component.
            /// </summary>
            public static class SelectRight
            {
                public static readonly int[] Y =
                {
                    0, 0, 0,
                    1, 1, 1,
                    0, 0,
                };

                public static readonly int[] Cb =
                {
                    0, 0, 0,
                    1, -1, 1,
                    0, 0,
                };

                public static readonly int[] Cr =
                {
                    0, 0, 0,
                    1, -1, -1,
                    0, 0,
                };
            }
        }

        // Waiting for C# 7 stackalloc keyword patiently ...
        private static class OnStackInputCache
        {
            /// <summary>
            /// A by-value copy of the input bytes, stored in a fixed-size buffer inside the struct.
            /// </summary>
            public unsafe struct Byte
            {
                // InputByteCount already accounts for 3 bytes per color, so no extra factor is needed.
                public fixed byte Data[InputByteCount];

                /// <summary>
                /// Copies <paramref name="data"/> into a new <see cref="Byte"/> value.
                /// </summary>
                /// <param name="data">The source bytes; at most <see cref="InputByteCount"/> elements.</param>
                /// <returns>A struct whose buffer starts with the contents of <paramref name="data"/>.</returns>
                /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
                /// <exception cref="ArgumentException"><paramref name="data"/> does not fit into the buffer.</exception>
                public static Byte Create(byte[] data)
                {
                    if (data == null)
                    {
                        throw new ArgumentNullException(nameof(data));
                    }

                    // The fixed buffer is written without bounds checks, so validate the length up front.
                    if (data.Length > InputByteCount)
                    {
                        throw new ArgumentException("Input does not fit into the fixed-size buffer.", nameof(data));
                    }

                    Byte result = default(Byte);
                    for (int i = 0; i < data.Length; i++)
                    {
                        result.Data[i] = data[i];
                    }

                    return result;
                }
            }
        }

        /// <summary>
        /// Output planes written by the benchmarks through raw float pointers.
        /// </summary>
        /// <remarks>
        /// Assumes each Block8x8F stores at least <see cref="InputColorCount"/> contiguous floats - confirm
        /// against the Block8x8F definition.
        /// </remarks>
        public struct Result
        {
            internal Block8x8F Y;
            internal Block8x8F Cb;
            internal Block8x8F Cr;
        }

        // The operation is defined as "RGBA -> YCbCr Transform a stream of bytes into a stream of floats"
        // We need to benchmark the whole operation, to get true results, not missing any side effects!
        private byte[] inputSourceRGB = null;

        // Scratch buffer for the Vector benchmarks: the input widened to ints, plus one vector of padding
        // so that the last vector load stays inside the array.
        private int[] inputSourceRGBAsInteger = null;

        /// <summary>
        /// Allocates the input buffers and fills the byte input with a deterministic pattern.
        /// </summary>
        [Setup]
        public void Setup()
        {
            this.inputSourceRGB = new byte[InputByteCount];
            for (int i = 0; i < this.inputSourceRGB.Length; i++)
            {
                this.inputSourceRGB[i] = (byte)(42 + i);
            }

            // Filling this should be part of the measured operation
            this.inputSourceRGBAsInteger = new int[InputByteCount + Vector<int>.Count];
        }

        /// <summary>
        /// Baseline: scalar floating point conversion, one pixel per iteration.
        /// </summary>
        [Benchmark(Baseline = true, Description = "Floating Point Conversion")]
        public unsafe void RgbaToYcbCrScalarFloat()
        {
            // Copy the input to the stack:
            OnStackInputCache.Byte input = OnStackInputCache.Byte.Create(this.inputSourceRGB);

            // On-stack output:
            Result result = default(Result);
            float* yPtr = (float*)&result.Y;
            float* cbPtr = (float*)&result.Cb;
            float* crPtr = (float*)&result.Cr;
            // end of code-bloat block :)

            for (int i = 0; i < InputColorCount; i++)
            {
                int i3 = i * 3;
                float r = input.Data[i3 + 0];
                float g = input.Data[i3 + 1];
                float b = input.Data[i3 + 2];

                *yPtr++ = (0.299F * r) + (0.587F * g) + (0.114F * b);
                *cbPtr++ = 128 + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
                *crPtr++ = 128 + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
            }
        }

        /// <summary>
        /// Floating point conversion using <see cref="Vector3"/> multiplication per pixel.
        /// </summary>
        [Benchmark(Description = "Simd Floating Point Conversion")]
        public unsafe void RgbaToYcbCrSimdFloat()
        {
            // Copy the input to the stack:
            OnStackInputCache.Byte input = OnStackInputCache.Byte.Create(this.inputSourceRGB);

            // On-stack output:
            Result result = default(Result);
            float* yPtr = (float*)&result.Y;
            float* cbPtr = (float*)&result.Cb;
            float* crPtr = (float*)&result.Cr;
            // end of code-bloat block :)

            for (int i = 0; i < InputColorCount; i++)
            {
                int i3 = i * 3;

                Vector3 vectorRgb = new Vector3(
                    input.Data[i3 + 0],
                    input.Data[i3 + 1],
                    input.Data[i3 + 2]);

                Vector3 vectorY = VectorY * vectorRgb;
                Vector3 vectorCb = VectorCb * vectorRgb;
                Vector3 vectorCr = VectorCr * vectorRgb;

                // Should be better in theory, but came out to be worse: :(
                // Vector3 c = new Vector3(0, 128, 128);
                // Vector3 xx = new Vector3(vectorY.X, vectorCb.X, vectorCr.X);
                // Vector3 yy = new Vector3(vectorY.Y, -vectorCb.Y, -vectorCr.Y);
                // Vector3 zz = new Vector3(vectorY.Z, vectorCb.Z, -vectorCr.Z);

                // c += xx + yy + zz;
                // *yPtr++ = c.X;
                // *cbPtr++ = c.Y;
                // *crPtr++ = c.Z;

                *yPtr++ = vectorY.X + vectorY.Y + vectorY.Z;
                *cbPtr++ = 128 + (vectorCb.X - vectorCb.Y + vectorCb.Z);
                *crPtr++ = 128 + (vectorCr.X - vectorCr.Y - vectorCr.Z);
            }
        }

        /// <summary>
        /// Scaled integer conversion that multiplies two packed pixels at once with <see cref="Vector{T}"/>
        /// and sums the lanes with scalar code.
        /// </summary>
        /// <exception cref="NotSupportedException">The runtime's Vector&lt;int&gt; is not 8 lanes wide.</exception>
        [Benchmark(Description = "Scaled Integer Conversion + Vector")]
        public unsafe void RgbaToYcbCrScaledIntegerSimd()
        {
            EnsureSupportedVectorWidth();

            // On-stack output:
            Result result = default(Result);
            float* yPtr = (float*)&result.Y;
            float* cbPtr = (float*)&result.Cb;
            float* crPtr = (float*)&result.Cr;
            // end of code-bloat block :)

            Vector<int> yCoeffs = new Vector<int>(ScaledCoeffs.Y);
            Vector<int> cbCoeffs = new Vector<int>(ScaledCoeffs.Cb);
            Vector<int> crCoeffs = new Vector<int>(ScaledCoeffs.Cr);

            // Widening the bytes to ints is part of the measured operation.
            for (int i = 0; i < this.inputSourceRGB.Length; i++)
            {
                this.inputSourceRGBAsInteger[i] = this.inputSourceRGB[i];
            }

            for (int i = 0; i < InputColorCount; i += 2)
            {
                // Loads [r0, g0, b0, r1, g1, b1, <next pixel's r, g>]; the last two lanes are multiplied by 0.
                Vector<int> rgb = new Vector<int>(this.inputSourceRGBAsInteger, i * 3);

                Vector<int> y = yCoeffs * rgb;
                Vector<int> cb = cbCoeffs * rgb;
                Vector<int> cr = crCoeffs * rgb;

                // First pixel: lanes 0..2
                *yPtr++ = (y[0] + y[1] + y[2]) >> 10;
                *cbPtr++ = 128 + ((cb[0] - cb[1] + cb[2]) >> 10);
                *crPtr++ = 128 + ((cr[0] - cr[1] - cr[2]) >> 10);

                // Second pixel: lanes 3..5 (the input is packed, 3 ints per pixel)
                *yPtr++ = (y[3] + y[4] + y[5]) >> 10;
                *cbPtr++ = 128 + ((cb[3] - cb[4] + cb[5]) >> 10);
                *crPtr++ = 128 + ((cr[3] - cr[4] - cr[5]) >> 10);
            }
        }

        /// <summary>
        /// This should perform better. Coreclr emitted Vector.Dot() code lacks the vectorization even with IsHardwareAccelerated == true.
        /// Kept this benchmark because maybe it will be improved in a future CLR release.
        /// <see href="https://www.gamedev.net/topic/673396-c-systemnumericsvectors-slow/"/>
        /// </summary>
        /// <exception cref="NotSupportedException">The runtime's Vector&lt;int&gt; is not 8 lanes wide.</exception>
        [Benchmark(Description = "Scaled Integer Conversion + Vector + Dot Product")]
        public unsafe void RgbaToYcbCrScaledIntegerSimdWithDotProduct()
        {
            EnsureSupportedVectorWidth();

            // On-stack output:
            Result result = default(Result);
            float* yPtr = (float*)&result.Y;
            float* cbPtr = (float*)&result.Cb;
            float* crPtr = (float*)&result.Cr;
            // end of code-bloat block :)

            Vector<int> yCoeffs = new Vector<int>(ScaledCoeffs.Y);
            Vector<int> cbCoeffs = new Vector<int>(ScaledCoeffs.Cb);
            Vector<int> crCoeffs = new Vector<int>(ScaledCoeffs.Cr);

            Vector<int> leftY = new Vector<int>(ScaledCoeffs.SelectLeft.Y);
            Vector<int> leftCb = new Vector<int>(ScaledCoeffs.SelectLeft.Cb);
            Vector<int> leftCr = new Vector<int>(ScaledCoeffs.SelectLeft.Cr);

            Vector<int> rightY = new Vector<int>(ScaledCoeffs.SelectRight.Y);
            Vector<int> rightCb = new Vector<int>(ScaledCoeffs.SelectRight.Cb);
            Vector<int> rightCr = new Vector<int>(ScaledCoeffs.SelectRight.Cr);

            // Widening the bytes to ints is part of the measured operation.
            for (int i = 0; i < this.inputSourceRGB.Length; i++)
            {
                this.inputSourceRGBAsInteger[i] = this.inputSourceRGB[i];
            }

            for (int i = 0; i < InputColorCount; i += 2)
            {
                Vector<int> rgb = new Vector<int>(this.inputSourceRGBAsInteger, i * 3);

                Vector<int> y = yCoeffs * rgb;
                Vector<int> cb = cbCoeffs * rgb;
                Vector<int> cr = crCoeffs * rgb;

                VectorizedConvertImpl(ref yPtr, ref cbPtr, ref crPtr, y, cb, cr, leftY, leftCb, leftCr);
                VectorizedConvertImpl(ref yPtr, ref cbPtr, ref crPtr, y, cb, cr, rightY, rightCb, rightCr);
            }
        }

        /// <summary>
        /// Reduces the per-lane products of one pixel to Y/Cb/Cr values via dot products with signed
        /// selection masks, writes them to the output pointers and advances each pointer by one float.
        /// </summary>
        /// <param name="yPtr">Destination for Y; incremented after the write.</param>
        /// <param name="cbPtr">Destination for Cb; incremented after the write.</param>
        /// <param name="crPtr">Destination for Cr; incremented after the write.</param>
        /// <param name="y">Per-lane products of the Y coefficients and the input.</param>
        /// <param name="cb">Per-lane products of the Cb coefficients and the input.</param>
        /// <param name="cr">Per-lane products of the Cr coefficients and the input.</param>
        /// <param name="yAgg">Selection mask applied to <paramref name="y"/>.</param>
        /// <param name="cbAgg">Selection mask applied to <paramref name="cb"/>.</param>
        /// <param name="crAgg">Selection mask applied to <paramref name="cr"/>.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void VectorizedConvertImpl(
            ref float* yPtr,
            ref float* cbPtr,
            ref float* crPtr,
            Vector<int> y,
            Vector<int> cb,
            Vector<int> cr,
            Vector<int> yAgg,
            Vector<int> cbAgg,
            Vector<int> crAgg)
        {
            int ySum = Vector.Dot(y, yAgg);
            int cbSum = Vector.Dot(cb, cbAgg);
            int crSum = Vector.Dot(cr, crAgg);
            *yPtr++ = ySum >> 10;
            *cbPtr++ = 128 + (cbSum >> 10);
            *crPtr++ = 128 + (crSum >> 10);
        }

        /// <summary>
        /// Scalar conversion using integer coefficients scaled by 1024.
        /// </summary>
        [Benchmark(Description = "Scaled Integer Conversion")]
        public unsafe void RgbaToYcbCrScaledInteger()
        {
            // Copy the input to the stack:
            OnStackInputCache.Byte input = OnStackInputCache.Byte.Create(this.inputSourceRGB);

            // On-stack output:
            Result result = default(Result);
            float* yPtr = (float*)&result.Y;
            float* cbPtr = (float*)&result.Cb;
            float* crPtr = (float*)&result.Cr;
            // end of code-bloat block :)

            for (int i = 0; i < InputColorCount; i++)
            {
                int i3 = i * 3;
                int r = input.Data[i3 + 0];
                int g = input.Data[i3 + 1];
                int b = input.Data[i3 + 2];

                // Scale by 1024, add .5F and truncate value
                int y0 = 306 * r; // (0.299F * 1024) + .5F
                int y1 = 601 * g; // (0.587F * 1024) + .5F
                int y2 = 117 * b; // (0.114F * 1024) + .5F

                int cb0 = -172 * r; // (-0.168736F * 1024) + .5F
                int cb1 = 339 * g; // (0.331264F * 1024) + .5F
                int cb2 = 512 * b; // (0.5F * 1024) + .5F

                int cr0 = 512 * r; // (0.5F * 1024) + .5F
                int cr1 = 429 * g; // (0.418688F * 1024) + .5F
                int cr2 = 83 * b; // (0.081312F * 1024) + .5F

                *yPtr++ = (y0 + y1 + y2) >> 10;
                *cbPtr++ = 128 + ((cb0 - cb1 + cb2) >> 10);
                *crPtr++ = 128 + ((cr0 - cr1 - cr2) >> 10);
            }
        }

        /// <summary>
        /// Scalar conversion that reads the scaled products from the LookupTables arrays instead of multiplying.
        /// </summary>
        [Benchmark(Description = "Scaled Integer LUT Conversion")]
        public unsafe void RgbaToYcbCrScaledIntegerLut()
        {
            // Copy the input to the stack:
            OnStackInputCache.Byte input = OnStackInputCache.Byte.Create(this.inputSourceRGB);

            // On-stack output:
            Result result = default(Result);
            float* yPtr = (float*)&result.Y;
            float* cbPtr = (float*)&result.Cb;
            float* crPtr = (float*)&result.Cr;
            // end of code-bloat block :)

            for (int i = 0; i < InputColorCount; i++)
            {
                int i3 = i * 3;

                int r = input.Data[i3 + 0];
                int g = input.Data[i3 + 1];
                int b = input.Data[i3 + 2];

                // TODO: Maybe concatenating all the arrays in LookupTables to a flat one can improve this!
                *yPtr++ = (LookupTables.Y0[r] + LookupTables.Y1[g] + LookupTables.Y2[b]) >> 10;
                *cbPtr++ = 128 + ((LookupTables.Cb0[r] - LookupTables.Cb1[g] + LookupTables.Cb2Cr0[b]) >> 10);
                *crPtr++ = 128 + ((LookupTables.Cb2Cr0[r] - LookupTables.Cr1[g] - LookupTables.Cr2[b]) >> 10);
            }
        }

        /// <summary>
        /// Fails fast when the runtime's Vector&lt;int&gt; width differs from the 8 lanes that the
        /// <see cref="ScaledCoeffs"/> tables and the lane indices in the Vector benchmarks are written for.
        /// </summary>
        /// <exception cref="NotSupportedException">Vector&lt;int&gt;.Count is not 8.</exception>
        private static void EnsureSupportedVectorWidth()
        {
            if (Vector<int>.Count != RequiredVectorWidth)
            {
                throw new NotSupportedException(
                    "The Vector<int> based benchmarks require Vector<int>.Count == " + RequiredVectorWidth
                    + ", but it is " + Vector<int>.Count + ".");
            }
        }
    }
}