From 65fb6395be1ae7479733ccdd10eae708a9230c0f Mon Sep 17 00:00:00 2001
From: Anton Firszov <anton.firszov@sapagroup.com>
Date: Thu, 12 Jan 2017 05:01:11 +0100
Subject: [PATCH] refactored RgbaToYcbCr benchmarks to operate on arrays +
 added Vector<int> based experiments

---
 .../ImageSharp.Benchmarks/Color/RgbToYCbCr.cs | 379 ++++++++++++++++--
 1 file changed, 337 insertions(+), 42 deletions(-)
diff --git a/tests/ImageSharp.Benchmarks/Color/RgbToYCbCr.cs b/tests/ImageSharp.Benchmarks/Color/RgbToYCbCr.cs
index 33bc85da0..0c777b86b 100644
--- a/tests/ImageSharp.Benchmarks/Color/RgbToYCbCr.cs
+++ b/tests/ImageSharp.Benchmarks/Color/RgbToYCbCr.cs
@@ -1,81 +1,376 @@
 ﻿namespace ImageSharp.Benchmarks
 {
+    using System;
+    using System.Buffers;
     using System.Numerics;
+    using System.Runtime.CompilerServices;
 
     using BenchmarkDotNet.Attributes;
 
+    using ImageSharp.Formats.Jpg;
+
     public partial class RgbToYCbCr
     {
+        private const int InputColorCount = 64;
+
+        private const int InputByteCount = InputColorCount * 3;
+
         private static readonly Vector3 VectorY = new Vector3(0.299F, 0.587F, 0.114F);
+
         private static readonly Vector3 VectorCb = new Vector3(-0.168736F, 0.331264F, 0.5F);
+
         private static readonly Vector3 VectorCr = new Vector3(0.5F, 0.418688F, 0.081312F);
 
+        private static class ScaledCoeffs
+        {
+            public static readonly int[] Y =
+                {
+                    306, 601, 117, 0,
+                    306, 601, 117, 0,
+                };
+
+            public static readonly int[] Cb =
+                {
+                    -172, 339, 512, 0,
+                    -172, 339, 512, 0,
+                };
+
+            public static readonly int[] Cr =
+                {
+                    512, 429, 83, 0,
+                    512, 429, 83, 0,
+                };
+
+            public static class SelectLeft
+            {
+                public static readonly int[] Y =
+                {
+                    1, 1, 1, 0,
+                    0, 0, 0, 0,
+                };
+
+                public static readonly int[] Cb =
+                {
+                    1, -1, 1, 0,
+                    0, 0, 0, 0,
+                };
+
+                public static readonly int[] Cr =
+                {
+                    1, -1, -1, 0,
+                    0, 0, 0, 0,
+                };
+            }
+
+            public static class SelectRight
+            {
+                public static readonly int[] Y =
+                {
+                    0, 0, 0, 0,
+                    1, 1, 1, 0,
+                };
+
+                public static readonly int[] Cb =
+                {
+                    0, 0, 0, 0,
+                    1, -1, 1, 0,
+                };
+
+                public static readonly int[] Cr =
+                {
+                    0, 0, 0, 0,
+                    1, -1, -1, 0,
+                };
+            }
+        }
+
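+        // Fixed-size buffer struct used to copy the input onto the stack (patiently waiting for a more convenient C# 7 stackalloc ...). NOTE(review): Data is declared as InputByteCount * 3 bytes, but the callers visible here copy only InputByteCount bytes - confirm the extra factor is intended.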
+        private static class OnStackInputCache
+        {
+            public unsafe struct Byte
+            {
+                public fixed byte Data[InputByteCount * 3];
+
+                public static Byte Create(byte[] data)
+                {
+                    Byte result = default(Byte);
+                    for (int i = 0; i < data.Length; i++)
+                    {
+                        result.Data[i] = data[i];
+                    }
+                    return result;
+                }
+            }
+        }
+        
+        public struct Result
+        {
+            internal Block8x8F Y;
+            internal Block8x8F Cb;
+            internal Block8x8F Cr;
+        }
+        
+        // The operation under test is "RGB -> YCbCr": transform a stream of bytes (3 per color) into a stream of floats.
+        // To get true results we need to benchmark the whole operation, so that no side effects are left out!
+        private byte[] inputSourceRGB = null;
+
+        private int[] inputSourceRGBAsInteger = null;
+
+        [Setup]
+        public void Setup()
+        {
+            // Console.WriteLine("Vector<int>.Count: " + Vector<int>.Count);
+            this.inputSourceRGB = new byte[InputByteCount];
+            for (int i = 0; i < this.inputSourceRGB.Length; i++)
+            {
+                this.inputSourceRGB[i] = (byte)(42 + i);
+            }
+            this.inputSourceRGBAsInteger = new int[InputByteCount + Vector<int>.Count]; // Filling this should be part of the measured operation
+        }
+
         [Benchmark(Baseline = true, Description = "Floating Point Conversion")]
-        public Vector3 RgbaToYcbCr()
+        public unsafe void RgbaToYcbCrScalarFloat()
         {
-            Vector3 v = new Vector3(255);
+            // Copy the input to the stack:
+            OnStackInputCache.Byte input = OnStackInputCache.Byte.Create(this.inputSourceRGB);
 
-            float yy = (0.299F * v.X) + (0.587F * v.Y) + (0.114F * v.Z);
-            float cb = 128 + ((-0.168736F * v.X) - (0.331264F * v.Y) + (0.5F * v.Z));
-            float cr = 128 + ((0.5F * v.X) - (0.418688F * v.Y) - (0.081312F * v.Z));
+            // On-stack output:
+            Result result = default(Result);
+            float* yPtr = (float*)&result.Y;
+            float* cbPtr = (float*)&result.Cb;
+            float* crPtr = (float*)&result.Cr;
+            // end of code-bloat block :)
 
-            return new Vector3(yy, cb, cr);
+            for (int i = 0; i < InputColorCount; i++)
+            {
+                int i3 = i * 3;
+                float r = input.Data[i3 + 0];
+                float g = input.Data[i3 + 1];
+                float b = input.Data[i3 + 2];
+
+                *yPtr++ = (0.299F * r) + (0.587F * g) + (0.114F * b);
+                *cbPtr++ = 128 + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
+                *crPtr++ = 128 + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
+            }
         }
 
         [Benchmark(Description = "Simd Floating Point Conversion")]
-        public Vector3 RgbaToYcbCrSimd()
+        public unsafe void RgbaToYcbCrSimdFloat()
         {
-            Vector3 vectorRgb = new Vector3(255);
-            Vector3 vectorY = VectorY * vectorRgb;
-            Vector3 vectorCb = VectorCb * vectorRgb;
-            Vector3 vectorCr = VectorCr * vectorRgb;
+            // Copy the input to the stack:
+            OnStackInputCache.Byte input = OnStackInputCache.Byte.Create(this.inputSourceRGB);
+
+            // On-stack output:
+            Result result = default(Result);
+            float* yPtr = (float*)&result.Y;
+            float* cbPtr = (float*)&result.Cb;
+            float* crPtr = (float*)&result.Cr;
+            // end of code-bloat block :)
+
+            for (int i = 0; i < InputColorCount; i++)
+            {
+                int i3 = i * 3;
 
-            float yy = vectorY.X + vectorY.Y + vectorY.Z;
-            float cb = 128 + (vectorCb.X - vectorCb.Y + vectorCb.Z);
-            float cr = 128 + (vectorCr.X - vectorCr.Y - vectorCr.Z);
+                Vector3 vectorRgb = new Vector3(
+                    input.Data[i3 + 0],
+                    input.Data[i3 + 1],
+                    input.Data[i3 + 2]
+                    );
 
-            return new Vector3(yy, cb, cr);
+                Vector3 vectorY = VectorY * vectorRgb;
+                Vector3 vectorCb = VectorCb * vectorRgb;
+                Vector3 vectorCr = VectorCr * vectorRgb;
+
+                // Should be better in theory, but came out to be worse: :(
+                // Vector3 c = new Vector3(0, 128, 128);
+                // Vector3 xx = new Vector3(vectorY.X, vectorCb.X, vectorCr.X);
+                // Vector3 yy = new Vector3(vectorY.Y, -vectorCb.Y, -vectorCr.Y);
+                // Vector3 zz = new Vector3(vectorY.Z, vectorCb.Z, -vectorCr.Z);
+
+                // c += xx + yy + zz;
+                // *yPtr++ = c.X;
+                // *cbPtr++ = c.Y;
+                // *crPtr++ = c.Z;
+
+                *yPtr++ = vectorY.X + vectorY.Y + vectorY.Z;
+                *cbPtr++ = 128 + (vectorCb.X - vectorCb.Y + vectorCb.Z);
+                *crPtr++ = 128 + (vectorCr.X - vectorCr.Y - vectorCr.Z);
+            }
         }
 
-        [Benchmark(Description = "Scaled Integer Conversion")]
-        public Vector3 RgbaToYcbCrScaled()
+        [Benchmark(Description = "Scaled Integer Conversion + Vector<int>")]
+        public unsafe void RgbaToYcbCrScaledIntegerSimd()
+        {
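+            // NOTE(review): no stack copy here; the input is widened to packed RGB ints (3 per pixel) below, yet ScaledCoeffs assumes 4 lanes per pixel, so lanes 4..6 of each vector hold g, b of pixel i+1 and r of pixel i+2 (and require Vector<int>.Count >= 8) - confirm the second-pixel results.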
+
+            // On-stack output:
+            Result result = default(Result);
+            float* yPtr = (float*)&result.Y;
+            float* cbPtr = (float*)&result.Cb;
+            float* crPtr = (float*)&result.Cr;
+            // end of code-bloat block :)
+            
+            Vector<int> yCoeffs = new Vector<int>(ScaledCoeffs.Y);
+            Vector<int> cbCoeffs = new Vector<int>(ScaledCoeffs.Cb);
+            Vector<int> crCoeffs = new Vector<int>(ScaledCoeffs.Cr);
+            
+            for (int i = 0; i < this.inputSourceRGB.Length; i++)
+            {
+                this.inputSourceRGBAsInteger[i] = this.inputSourceRGB[i];
+            }
+
+            for (int i = 0; i < InputColorCount; i += 2)
+            {
+                Vector<int> rgb = new Vector<int>(this.inputSourceRGBAsInteger, i * 3);
+
+                Vector<int> y = yCoeffs * rgb;
+                Vector<int> cb = cbCoeffs * rgb;
+                Vector<int> cr = crCoeffs * rgb;
+                
+                *yPtr++ = (y[0] + y[1] + y[2]) >> 10;
+                *cbPtr++ = 128 + ((cb[0] - cb[1] + cb[2]) >> 10);
+                *crPtr++ = 128 + ((cr[0] - cr[1] - cr[2]) >> 10);
+
+                *yPtr++ = (y[4] + y[5] + y[6]) >> 10;
+                *cbPtr++ = 128 + ((cb[4] - cb[5] + cb[6]) >> 10);
+                *crPtr++ = 128 + ((cr[4] - cr[5] - cr[6]) >> 10);
+            }
+        }
+
+        /// <summary>
+        /// In theory this should perform better, but the Vector.Dot() code emitted by CoreCLR is not vectorized, even with IsHardwareAccelerated == true.
+        /// This benchmark is kept because that may be improved in a future CLR release.
+        /// <see>
+        ///     <cref>https://www.gamedev.net/topic/673396-c-systemnumericsvectors-slow/</cref>
+        /// </see>
+        /// </summary>
+        [Benchmark(Description = "Scaled Integer Conversion + Vector<int> + Dot Product")]
+        public unsafe void RgbaToYcbCrScaledIntegerSimdWithDotProduct()
         {
-            int r = 255;
-            int g = 255;
-            int b = 255;
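+            // NOTE(review): no stack copy here; the input is widened to packed RGB ints (3 per pixel) below, yet ScaledCoeffs / SelectRight assume 4 lanes per pixel, so lanes 4..6 of each vector hold g, b of pixel i+1 and r of pixel i+2 (and require Vector<int>.Count >= 8) - confirm the second-pixel results.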
+
+            // On-stack output:
+            Result result = default(Result);
+            float* yPtr = (float*)&result.Y;
+            float* cbPtr = (float*)&result.Cb;
+            float* crPtr = (float*)&result.Cr;
+            // end of code-bloat block :)
+
+            Vector<int> yCoeffs = new Vector<int>(ScaledCoeffs.Y);
+            Vector<int> cbCoeffs = new Vector<int>(ScaledCoeffs.Cb);
+            Vector<int> crCoeffs = new Vector<int>(ScaledCoeffs.Cr);
+
+            Vector<int> leftY = new Vector<int>(ScaledCoeffs.SelectLeft.Y);
+            Vector<int> leftCb = new Vector<int>(ScaledCoeffs.SelectLeft.Cb);
+            Vector<int> leftCr = new Vector<int>(ScaledCoeffs.SelectLeft.Cr);
 
-            // Scale by 1024, add .5F and truncate value
-            int y0 = 306 * r; // (0.299F * 1024) + .5F
-            int y1 = 601 * g; // (0.587F * 1024) + .5F
-            int y2 = 117 * b; // (0.114F * 1024) + .5F
+            Vector<int> rightY = new Vector<int>(ScaledCoeffs.SelectRight.Y);
+            Vector<int> rightCb = new Vector<int>(ScaledCoeffs.SelectRight.Cb);
+            Vector<int> rightCr = new Vector<int>(ScaledCoeffs.SelectRight.Cr);
 
-            int cb0 = -172 * r; // (-0.168736F * 1024) + .5F
-            int cb1 = 339 * g; // (0.331264F * 1024) + .5F
-            int cb2 = 512 * b; // (0.5F * 1024) + .5F
+            for (int i = 0; i < this.inputSourceRGB.Length; i++)
+            {
+                this.inputSourceRGBAsInteger[i] = this.inputSourceRGB[i];
+            }
 
-            int cr0 = 512 * r; // (0.5F * 1024) + .5F
-            int cr1 = 429 * g; // (0.418688F * 1024) + .5F
-            int cr2 = 83 * b; // (0.081312F * 1024) + .5F
+            for (int i = 0; i < InputColorCount; i += 2)
+            {
+                Vector<int> rgb = new Vector<int>(this.inputSourceRGBAsInteger, i * 3);
 
-            float yy = (y0 + y1 + y2) >> 10;
-            float cb = 128 + ((cb0 - cb1 + cb2) >> 10);
-            float cr = 128 + ((cr0 - cr1 - cr2) >> 10);
+                Vector<int> y = yCoeffs * rgb;
+                Vector<int> cb = cbCoeffs * rgb;
+                Vector<int> cr = crCoeffs * rgb;
 
-            return new Vector3(yy, cb, cr);
+                VectorizedConvertImpl(ref yPtr, ref cbPtr, ref crPtr, y, cb, cr, leftY, leftCb, leftCr);
+                VectorizedConvertImpl(ref yPtr, ref cbPtr, ref crPtr, y, cb, cr, rightY, rightCb, rightCr);
+            }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void VectorizedConvertImpl(
+            ref float* yPtr,
+            ref float* cbPtr,
+            ref float* crPtr,
+            Vector<int> y,
+            Vector<int> cb,
+            Vector<int> cr,
+            Vector<int> yAgg,
+            Vector<int> cbAgg,
+            Vector<int> crAgg)
+        {
+            int ySum = Vector.Dot(y, yAgg);
+            int cbSum = Vector.Dot(cb, cbAgg);
+            int crSum = Vector.Dot(cr, crAgg);
+            *yPtr++ = ySum >> 10;
+            *cbPtr++ = 128 + (cbSum >> 10);
+            *crPtr++ = 128 + (crSum >> 10);
+        }
+
+        [Benchmark(Description = "Scaled Integer Conversion")]
+        public unsafe void RgbaToYcbCrScaledInteger()
+        {
+            // Copy the input to the stack:
+            OnStackInputCache.Byte input = OnStackInputCache.Byte.Create(this.inputSourceRGB);
+
+            // On-stack output:
+            Result result = default(Result);
+            float* yPtr = (float*)&result.Y;
+            float* cbPtr = (float*)&result.Cb;
+            float* crPtr = (float*)&result.Cr;
+            // end of code-bloat block :)
+
+            for (int i = 0; i < InputColorCount; i++)
+            {
+                int i3 = i * 3;
+                int r = input.Data[i3 + 0];
+                int g = input.Data[i3 + 1];
+                int b = input.Data[i3 + 2];
+
+                // Scale by 1024, add .5F and truncate value
+                int y0 = 306 * r; // (0.299F * 1024) + .5F
+                int y1 = 601 * g; // (0.587F * 1024) + .5F
+                int y2 = 117 * b; // (0.114F * 1024) + .5F
+
+                int cb0 = -172 * r; // (-0.168736F * 1024) + .5F
+                int cb1 = 339 * g; // (0.331264F * 1024) + .5F
+                int cb2 = 512 * b; // (0.5F * 1024) + .5F
+
+                int cr0 = 512 * r; // (0.5F * 1024) + .5F
+                int cr1 = 429 * g; // (0.418688F * 1024) + .5F
+                int cr2 = 83 * b; // (0.081312F * 1024) + .5F
+
+                *yPtr++ = (y0 + y1 + y2) >> 10;
+                *cbPtr++ = 128 + ((cb0 - cb1 + cb2) >> 10);
+                *crPtr++ = 128 + ((cr0 - cr1 - cr2) >> 10);
+            }
+        }
+        
         [Benchmark(Description = "Scaled Integer LUT Conversion")]
-        public Vector3 RgbaToYcbCrScaledLut()
+        public unsafe void RgbaToYcbCrScaledIntegerLut()
         {
-            int r = 255;
-            int g = 255;
-            int b = 255;
+            // Copy the input to the stack:
+            OnStackInputCache.Byte input = OnStackInputCache.Byte.Create(this.inputSourceRGB);
+
+            // On-stack output:
+            Result result = default(Result);
+            float* yPtr = (float*)&result.Y;
+            float* cbPtr = (float*)&result.Cb;
+            float* crPtr = (float*)&result.Cr;
+            // end of code-bloat block :)
+
+            for (int i = 0; i < InputColorCount; i++)
+            {
+                int i3 = i * 3;
 
-            float yy = (LookupTables.Y0[r] + LookupTables.Y1[g] + LookupTables.Y2[b]) >> 10;
-            float cb = 128 + ((LookupTables.Cb0[r] - LookupTables.Cb1[g] + LookupTables.Cb2Cr0[b]) >> 10);
-            float cr = 128 + ((LookupTables.Cb2Cr0[r] - LookupTables.Cr1[g] - LookupTables.Cr2[b]) >> 10);
+                int r = input.Data[i3 + 0];
+                int g = input.Data[i3 + 1];
+                int b = input.Data[i3 + 2];
 
-            return new Vector3(yy, cb, cr);
+                // TODO: Maybe concatenating all the arrays in LookupTables to a flat one can improve this!
+                *yPtr++ = (LookupTables.Y0[r] + LookupTables.Y1[g] + LookupTables.Y2[b]) >> 10;
+                *cbPtr++ = 128 + ((LookupTables.Cb0[r] - LookupTables.Cb1[g] + LookupTables.Cb2Cr0[b]) >> 10);
+                *crPtr++ = 128 + ((LookupTables.Cb2Cr0[r] - LookupTables.Cr1[g] - LookupTables.Cr2[b]) >> 10);
+            }
         }
     }
 }
\ No newline at end of file