From efd4d22665239b098aa1ede45231b6ed59586b64 Mon Sep 17 00:00:00 2001
From: Nicolas Portmann <ndportmann@gmail.com>
Date: Sun, 17 Jan 2021 13:54:36 +0100
Subject: [PATCH 01/13] Add initial vectorized implementation with benchmarks

---
 ...bCrTables.cs => RgbToYCbCrConverterLut.cs} |  35 +++-
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 182 ++++++++++++++++++
 .../Encoder/YCbCrForwardConverter{TPixel}.cs  |  28 ++-
 .../Encoder/YCbCrForwardConverterBenchmark.cs |  56 ++++++
 4 files changed, 278 insertions(+), 23 deletions(-)
 rename src/ImageSharp/Formats/Jpeg/Components/Encoder/{RgbToYCbCrTables.cs => RgbToYCbCrConverterLut.cs} (79%)
 create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
 create mode 100644 tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
similarity index 79%
rename from src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs
rename to src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 236eff27cc..835a34f652 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -1,16 +1,17 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System;
 using System.Runtime.CompilerServices;
+using SixLabors.ImageSharp.PixelFormats;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
     /// <summary>
     /// Provides 8-bit lookup tables for converting from Rgb to YCbCr colorspace.
     /// Methods to build the tables are based on libjpeg implementation.
-    /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
     /// </summary>
-    internal unsafe struct RgbToYCbCrTables
+    internal unsafe struct RgbToYCbCrConverterLut
     {
         /// <summary>
         /// The red luminance table
@@ -63,10 +64,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// Initializes the YCbCr tables
         /// </summary>
-        /// <returns>The initialized <see cref="RgbToYCbCrTables"/></returns>
-        public static RgbToYCbCrTables Create()
+        /// <returns>The initialized <see cref="RgbToYCbCrConverterLut"/></returns>
+        public static RgbToYCbCrConverterLut Create()
         {
-            RgbToYCbCrTables tables = default;
+            RgbToYCbCrConverterLut tables = default;
 
             for (int i = 0; i <= 255; i++)
             {
@@ -92,11 +93,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         }
 
         /// <summary>
-        /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
         /// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public void ConvertPixelInto(
+        private void ConvertPixelInto(
             int r,
             int g,
             int b,
@@ -111,10 +111,29 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
             cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
 
-            // float cr = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero);
+            // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
             crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
         }
 
+        public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        {
+            ref Rgb24 rgbStart = ref rgbSpan[0];
+
+            for (int i = 0; i < 64; i++)
+            {
+                ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
+
+                this.ConvertPixelInto(
+                    c.R,
+                    c.G,
+                    c.B,
+                    ref yBlock,
+                    ref cbBlock,
+                    ref crBlock,
+                    i);
+            }
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static int Fix(float x)
             => (int)((x * (1L << ScaleBits)) + 0.5F);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
new file mode 100644
index 0000000000..068c3db964
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -0,0 +1,182 @@
+﻿// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+    internal static class RgbToYCbCrConverterVectorized
+    {
+        private static ReadOnlySpan<byte> ExtractionMasks => new byte[]
+        {
+            0x0, 0xFF, 0xFF, 0xFF, 0x1, 0xFF, 0xFF, 0xFF, 0x2, 0xFF, 0xFF, 0xFF, 0x3, 0xFF, 0xFF, 0xFF,   0x10, 0xFF, 0xFF, 0xFF, 0x11, 0xFF, 0xFF, 0xFF, 0x12, 0xFF, 0xFF, 0xFF, 0x13, 0xFF, 0xFF, 0xFF,
+            0x4, 0xFF, 0xFF, 0xFF, 0x5, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,   0x14, 0xFF, 0xFF, 0xFF, 0x15, 0xFF, 0xFF, 0xFF, 0x16, 0xFF, 0xFF, 0xFF, 0x17, 0xFF, 0xFF, 0xFF,
+            0x8, 0xFF, 0xFF, 0xFF, 0x9, 0xFF, 0xFF, 0xFF, 0xA, 0xFF, 0xFF, 0xFF, 0xB, 0xFF, 0xFF, 0xFF,   0x18, 0xFF, 0xFF, 0xFF, 0x19, 0xFF, 0xFF, 0xFF, 0x1A, 0xFF, 0xFF, 0xFF, 0x1B, 0xFF, 0xFF, 0xFF,
+            0xC, 0xFF, 0xFF, 0xFF, 0xD, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,   0x1C, 0xFF, 0xFF, 0xFF, 0x1D, 0xFF, 0xFF, 0xFF, 0x1E, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF,
+        };
+
+        public static bool IsSupported
+        {
+            get
+            {
+#if SUPPORTS_RUNTIME_INTRINSICS
+                return Avx2.IsSupported && Fma.IsSupported;
+#else
+                return false;
+#endif
+            }
+        }
+
+        public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        {
+            Debug.Assert(IsSupported, "AVX2 and FMA are required to run this converter");
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+            SeparateRgb(rgbSpan);
+            ConvertInternal(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+#endif
+        }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
+        /// Rearranges the provided <paramref name="rgbSpan"/> in-place
+        /// from { r00, g00, b00, ..., r63, g63, b63 }
+        /// to { r00, ... r31, g00, ..., g31, b00, ..., b31,
+        ///      r32, ... r63, g32, ..., g63, b31, ..., b63 }
+        /// </summary>
+        /// <remarks>
+        /// SSE is used for this operation as it is significantly faster than AVX in this specific case.
+        /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers.
+        /// </remarks>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static void SeparateRgb(ReadOnlySpan<Rgb24> rgbSpan)
+        {
+            var selectRed0 = Vector128.Create(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectRed1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectRed2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D);
+
+            var selectGreen0 = Vector128.Create(0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectGreen1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectGreen2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E);
+
+            var selectBlue0 = Vector128.Create(0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectBlue1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+            var selectBlue2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F);
+
+            for (int i = 0; i < 2; i++)
+            {
+                ref Vector128<byte> inRef = ref Unsafe.Add(ref Unsafe.As<Rgb24, Vector128<byte>>(ref MemoryMarshal.GetReference(rgbSpan)), i * 6);
+
+                Vector128<byte> in0 = inRef;
+                Vector128<byte> in1 = Unsafe.Add(ref inRef, 1);
+                Vector128<byte> in2 = Unsafe.Add(ref inRef, 2);
+
+                Vector128<byte> r0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2));
+                Vector128<byte> g0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2));
+                Vector128<byte> b0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2));
+
+                in0 = Unsafe.Add(ref inRef, 3);
+                in1 = Unsafe.Add(ref inRef, 4);
+                in2 = Unsafe.Add(ref inRef, 5);
+
+                Vector128<byte> r1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2));
+                Vector128<byte> g1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2));
+                Vector128<byte> b1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2));
+
+                inRef = r0;
+                Unsafe.Add(ref inRef, 1) = r1;
+                Unsafe.Add(ref inRef, 2) = g0;
+                Unsafe.Add(ref inRef, 3) = g1;
+                Unsafe.Add(ref inRef, 4) = b0;
+                Unsafe.Add(ref inRef, 5) = b1;
+            }
+        }
+
+        /// <summary>
+        /// Converts the previously separated (see <see cref="SeparateRgb"/>) RGB values to YCbCr using AVX2 and FMA.
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static void ConvertInternal(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        {
+            var f0299 = Vector256.Create(0.299f);
+            var f0587 = Vector256.Create(0.587f);
+            var f0114 = Vector256.Create(0.114f);
+            var fn0168736 = Vector256.Create(-0.168736f);
+            var fn0331264 = Vector256.Create(-0.331264f);
+            var f128 = Vector256.Create(128f);
+            var fn0418688 = Vector256.Create(-0.418688f);
+            var fn0081312F = Vector256.Create(-0.081312F);
+            var f05 = Vector256.Create(0.5f);
+
+            ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
+
+            for (int i = 0; i < 2; i++)
+            {
+                ref Vector256<float> destYRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock), i * 4);
+                ref Vector256<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock), i * 4);
+                ref Vector256<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock), i * 4);
+
+                Vector256<byte> red = Unsafe.Add(ref inRef, i * 3);
+                Vector256<byte> green = Unsafe.Add(ref inRef, (i * 3) + 1);
+                Vector256<byte> blue = Unsafe.Add(ref inRef, (i * 3) + 2);
+
+                for (int j = 0; j < 2; j++)
+                {
+                    // 1st part of unrolled loop
+                    Vector256<byte> mask = Unsafe.Add(ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractionMasks)), j * 2);
+
+                    Vector256<float> r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32());
+                    Vector256<float> g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32());
+                    Vector256<float> b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32());
+
+                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
+                    Vector256<float> yy0 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b)));
+
+                    // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+                    Vector256<float> cb0 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b))));
+
+                    // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+                    Vector256<float> cr0 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b))));
+
+                    // 2nd part of unrolled loop
+                    mask = Unsafe.Add(ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractionMasks)), (j * 2) + 1);
+
+                    r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32());
+                    g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32());
+                    b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32());
+
+                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
+                    Vector256<float> yy1 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b)));
+
+                    // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+                    Vector256<float> cb1 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b))));
+
+                    // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+                    Vector256<float> cr1 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b))));
+
+                    // store results from 1st and 2nd part
+                    Vector256<float> tmpY = Avx.Permute2x128(yy0, yy1, 0b0010_0001);
+                    Unsafe.Add(ref destYRef, j) = Avx.Blend(yy0, tmpY, 0b1111_0000);
+                    Unsafe.Add(ref destYRef, j + 2) = Avx.Blend(yy1, tmpY, 0b0000_1111);
+
+                    Vector256<float> tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001);
+                    Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000);
+                    Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb0, tmpCb, 0b0000_1111);
+
+                    Vector256<float> tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001);
+                    Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000);
+                    Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr0, tmpCr, 0b0000_1111);
+                }
+            }
+        }
+#endif
+    }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index 4d6186e22f..b658993278 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -2,7 +2,6 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-using System.Runtime.CompilerServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.PixelFormats;
 
@@ -33,7 +32,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// <summary>
         /// The color conversion tables
         /// </summary>
-        private RgbToYCbCrTables colorTables;
+        private RgbToYCbCrConverterLut colorTables;
 
         /// <summary>
         /// Temporal 8x8 block to hold TPixel data
@@ -48,7 +47,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public static YCbCrForwardConverter<TPixel> Create()
         {
             var result = default(YCbCrForwardConverter<TPixel>);
-            result.colorTables = RgbToYCbCrTables.Create();
+            if (RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                // Avoid creating lookup tables, when vectorized converter is supported
+                result.colorTables = RgbToYCbCrConverterLut.Create();
+            }
+
             return result;
         }
 
@@ -65,20 +69,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             ref Block8x8F yBlock = ref this.Y;
             ref Block8x8F cbBlock = ref this.Cb;
             ref Block8x8F crBlock = ref this.Cr;
-            ref Rgb24 rgbStart = ref rgbSpan[0];
 
-            for (int i = 0; i < 64; i++)
+            if (RgbToYCbCrConverterVectorized.IsSupported)
             {
-                ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
-
-                this.colorTables.ConvertPixelInto(
-                    c.R,
-                    c.G,
-                    c.B,
-                    ref yBlock,
-                    ref cbBlock,
-                    ref crBlock,
-                    i);
+                RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+            }
+            else
+            {
+                this.colorTables.Convert(rgbSpan,  ref yBlock, ref cbBlock, ref crBlock);
             }
         }
     }
diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
new file mode 100644
index 0000000000..1db4072932
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
@@ -0,0 +1,56 @@
+﻿// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Formats.Jpeg.Components;
+using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder
+{
+    public class YCbCrForwardConverterBenchmark
+    {
+        private RgbToYCbCrConverterLut converter;
+        private Rgb24[] data;
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            this.converter = RgbToYCbCrConverterLut.Create();
+
+            var r = new Random(42);
+            this.data = new Rgb24[64];
+
+            var d = new byte[3];
+            for (int i = 0; i < this.data.Length; i++)
+            {
+                r.NextBytes(d);
+                this.data[i] = new Rgb24(d[0], d[1], d[2]);
+            }
+        }
+
+        [Benchmark(Baseline = true)]
+        public void ConvertLut()
+        {
+            Block8x8F y = default;
+            Block8x8F cb = default;
+            Block8x8F cr = default;
+
+            this.converter.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
+        }
+
+        [Benchmark]
+        public void ConvertVectorized()
+        {
+            Block8x8F y = default;
+            Block8x8F cb = default;
+            Block8x8F cr = default;
+
+            if (RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
+            }
+        }
+    }
+}

From 429696bd5e0ae1a5a872d8711c305228799726f9 Mon Sep 17 00:00:00 2001
From: Nicolas Portmann <ndportmann@gmail.com>
Date: Sun, 17 Jan 2021 14:55:21 +0100
Subject: [PATCH 02/13] Fix mistakes in final touches

---
 .../Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs  | 4 ++--
 .../Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 068c3db964..ddaa2069ed 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -169,11 +169,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 
                     Vector256<float> tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001);
                     Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000);
-                    Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb0, tmpCb, 0b0000_1111);
+                    Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb1, tmpCb, 0b0000_1111);
 
                     Vector256<float> tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001);
                     Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000);
-                    Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr0, tmpCr, 0b0000_1111);
+                    Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr1, tmpCr, 0b0000_1111);
                 }
             }
         }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index b658993278..8fcc63c6aa 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -47,7 +47,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         public static YCbCrForwardConverter<TPixel> Create()
         {
             var result = default(YCbCrForwardConverter<TPixel>);
-            if (RgbToYCbCrConverterVectorized.IsSupported)
+            if (!RgbToYCbCrConverterVectorized.IsSupported)
             {
                 // Avoid creating lookup tables, when vectorized converter is supported
                 result.colorTables = RgbToYCbCrConverterLut.Create();

From 93099d1585e14706f85ea58682d799d4b446b8e4 Mon Sep 17 00:00:00 2001
From: Nicolas Portmann <ndportmann@gmail.com>
Date: Sun, 17 Jan 2021 22:50:32 +0100
Subject: [PATCH 03/13] Add unit tests for both converters

---
 .../Encoder/RgbToYCbCrConverterLut.cs         |  2 +-
 .../Formats/Jpg/RgbToYCbCrConverterTests.cs   | 98 +++++++++++++++++++
 2 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 835a34f652..3c1a02c5aa 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -111,7 +111,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
             cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
 
-            // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+            // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
             crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
         }
 
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
new file mode 100644
index 0000000000..9134de42e5
--- /dev/null
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@@ -0,0 +1,98 @@
+﻿// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using SixLabors.ImageSharp.ColorSpaces;
+using SixLabors.ImageSharp.Formats.Jpeg.Components;
+using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
+using SixLabors.ImageSharp.PixelFormats;
+using SixLabors.ImageSharp.Tests.Colorspaces.Conversion;
+using Xunit;
+using Xunit.Abstractions;
+
+// ReSharper disable InconsistentNaming
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
+{
+    public class RgbToYCbCrConverterTests
+    {
+        private const float Epsilon = .5F;
+        private static readonly ApproximateColorSpaceComparer Comparer = new ApproximateColorSpaceComparer(Epsilon);
+
+        public RgbToYCbCrConverterTests(ITestOutputHelper output)
+        {
+            this.Output = output;
+        }
+
+        private ITestOutputHelper Output { get; }
+
+        [Fact]
+        public void TestLutConverter()
+        {
+            Rgb24[] data = CreateTestData();
+            var target = RgbToYCbCrConverterLut.Create();
+
+            Block8x8F y = default;
+            Block8x8F cb = default;
+            Block8x8F cr = default;
+
+            target.Convert(data.AsSpan(), ref y, ref cb, ref cr);
+
+            Verify(data, ref y, ref cb, ref cr);
+        }
+
+        [Fact]
+        public void TestVectorizedConverter()
+        {
+            if (!RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                this.Output.WriteLine("No AVX and/or FMA present, skipping test!");
+                return;
+            }
+
+            Rgb24[] data = CreateTestData();
+
+            // RgbToYCbCrConverterVectorized uses `data` as working memory so we need a copy for verification below
+            Rgb24[] dataCopy = new Rgb24[data.Length];
+            data.CopyTo(dataCopy, 0);
+
+            Block8x8F y = default;
+            Block8x8F cb = default;
+            Block8x8F cr = default;
+
+            RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr);
+
+            Verify(dataCopy, ref y, ref cb, ref cr);
+        }
+
+        private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult)
+        {
+            for (int i = 0; i < data.Length; i++)
+            {
+                int r = data[i].R;
+                int g = data[i].G;
+                int b = data[i].B;
+
+                float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
+                float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
+                float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
+
+                Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), Comparer);
+            }
+        }
+
+        private static Rgb24[] CreateTestData()
+        {
+            // Fixed seed keeps the 64 pixels identical between runs, so a failure can be reproduced.
+            var data = new Rgb24[64];
+            var r = new Random(42);
+            var random = new byte[3];
+            for (int i = 0; i < data.Length; i++)
+            {
+                r.NextBytes(random);
+                data[i] = new Rgb24(random[0], random[1], random[2]);
+            }
+
+            return data;
+        }
+    }
+}

From 08a68af1a997c56a4e6a721cf9de7fdb1cd1f4ce Mon Sep 17 00:00:00 2001
From: Nicolas Portmann <ndportmann@gmail.com>
Date: Sun, 17 Jan 2021 23:01:18 +0100
Subject: [PATCH 04/13] Allow epsilon of 1F for existing LUT converter

---
 .../Formats/Jpg/RgbToYCbCrConverterTests.cs           | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
index 9134de42e5..776cbb44f3 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@@ -15,9 +15,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
     public class RgbToYCbCrConverterTests
     {
-        private const float Epsilon = .5F;
-        private static readonly ApproximateColorSpaceComparer Comparer = new ApproximateColorSpaceComparer(Epsilon);
-
         public RgbToYCbCrConverterTests(ITestOutputHelper output)
         {
             this.Output = output;
@@ -37,7 +34,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
             target.Convert(data.AsSpan(), ref y, ref cb, ref cr);
 
-            Verify(data, ref y, ref cb, ref cr);
+            Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F));
         }
 
         [Fact]
@@ -61,10 +58,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
             RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr);
 
-            Verify(dataCopy, ref y, ref cb, ref cr);
+            Verify(dataCopy, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
         }
 
-        private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult)
+        private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer)
         {
             for (int i = 0; i < data.Length; i++)
             {
@@ -76,7 +73,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
                 float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
 
-                Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), Comparer);
+                Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), comparer);
             }
         }
 

From 5033e3eb950aa15a89a1ccd1f706c629344f9119 Mon Sep 17 00:00:00 2001
From: Nicolas Portmann <ndportmann@gmail.com>
Date: Mon, 18 Jan 2021 12:49:08 +0100
Subject: [PATCH 05/13] Improve algorithm

---
 .../Encoder/RgbToYCbCrConverterVectorized.cs  | 180 ++++++------------
 .../Formats/Jpg/RgbToYCbCrConverterTests.cs   |   8 +-
 2 files changed, 61 insertions(+), 127 deletions(-)

diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index ddaa2069ed..209cc3c6ab 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -15,97 +15,43 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
     internal static class RgbToYCbCrConverterVectorized
     {
-        private static ReadOnlySpan<byte> ExtractionMasks => new byte[]
-        {
-            0x0, 0xFF, 0xFF, 0xFF, 0x1, 0xFF, 0xFF, 0xFF, 0x2, 0xFF, 0xFF, 0xFF, 0x3, 0xFF, 0xFF, 0xFF,   0x10, 0xFF, 0xFF, 0xFF, 0x11, 0xFF, 0xFF, 0xFF, 0x12, 0xFF, 0xFF, 0xFF, 0x13, 0xFF, 0xFF, 0xFF,
-            0x4, 0xFF, 0xFF, 0xFF, 0x5, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,   0x14, 0xFF, 0xFF, 0xFF, 0x15, 0xFF, 0xFF, 0xFF, 0x16, 0xFF, 0xFF, 0xFF, 0x17, 0xFF, 0xFF, 0xFF,
-            0x8, 0xFF, 0xFF, 0xFF, 0x9, 0xFF, 0xFF, 0xFF, 0xA, 0xFF, 0xFF, 0xFF, 0xB, 0xFF, 0xFF, 0xFF,   0x18, 0xFF, 0xFF, 0xFF, 0x19, 0xFF, 0xFF, 0xFF, 0x1A, 0xFF, 0xFF, 0xFF, 0x1B, 0xFF, 0xFF, 0xFF,
-            0xC, 0xFF, 0xFF, 0xFF, 0xD, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,   0x1C, 0xFF, 0xFF, 0xFF, 0x1D, 0xFF, 0xFF, 0xFF, 0x1E, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF,
-        };
-
         public static bool IsSupported
         {
             get
             {
 #if SUPPORTS_RUNTIME_INTRINSICS
-                return Avx2.IsSupported && Fma.IsSupported;
+                return Avx2.IsSupported;
 #else
                 return false;
 #endif
             }
         }
 
-        public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
-        {
-            Debug.Assert(IsSupported, "AVX2 and FMA are required to run this converter");
-
 #if SUPPORTS_RUNTIME_INTRINSICS
-            SeparateRgb(rgbSpan);
-            ConvertInternal(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
-#endif
-        }
-
-#if SUPPORTS_RUNTIME_INTRINSICS
-        /// <summary>
-        /// Rearranges the provided <paramref name="rgbSpan"/> in-place
-        /// from { r00, g00, b00, ..., r63, g63, b63 }
-        /// to { r00, ... r31, g00, ..., g31, b00, ..., b31,
-        ///      r32, ... r63, g32, ..., g63, b31, ..., b63 }
-        /// </summary>
-        /// <remarks>
-        /// SSE is used for this operation as it is significantly faster than AVX in this specific case.
-        /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers.
-        /// </remarks>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static void SeparateRgb(ReadOnlySpan<Rgb24> rgbSpan)
+        private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[]
         {
-            var selectRed0 = Vector128.Create(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-            var selectRed1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-            var selectRed2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D);
-
-            var selectGreen0 = Vector128.Create(0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-            var selectGreen1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-            var selectGreen2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E);
+            0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
+            3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0
+        };
 
-            var selectBlue0 = Vector128.Create(0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-            var selectBlue1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-            var selectBlue2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F);
+        private static ReadOnlySpan<byte> MoveLast24BytesToSeparateLanes => new byte[]
+        {
+            2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
+            5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0
+        };
 
-            for (int i = 0; i < 2; i++)
-            {
-                ref Vector128<byte> inRef = ref Unsafe.Add(ref Unsafe.As<Rgb24, Vector128<byte>>(ref MemoryMarshal.GetReference(rgbSpan)), i * 6);
-
-                Vector128<byte> in0 = inRef;
-                Vector128<byte> in1 = Unsafe.Add(ref inRef, 1);
-                Vector128<byte> in2 = Unsafe.Add(ref inRef, 2);
-
-                Vector128<byte> r0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2));
-                Vector128<byte> g0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2));
-                Vector128<byte> b0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2));
-
-                in0 = Unsafe.Add(ref inRef, 3);
-                in1 = Unsafe.Add(ref inRef, 4);
-                in2 = Unsafe.Add(ref inRef, 5);
-
-                Vector128<byte> r1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2));
-                Vector128<byte> g1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2));
-                Vector128<byte> b1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2));
-
-                inRef = r0;
-                Unsafe.Add(ref inRef, 1) = r1;
-                Unsafe.Add(ref inRef, 2) = g0;
-                Unsafe.Add(ref inRef, 3) = g1;
-                Unsafe.Add(ref inRef, 4) = b0;
-                Unsafe.Add(ref inRef, 5) = b1;
-            }
-        }
+        private static ReadOnlySpan<byte> ExtractRgb => new byte[]
+        {
+            0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF,
+            0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF
+        };
+#endif
 
-        /// <summary>
-        /// Converts the previously separated (see <see cref="SeparateRgb"/>) RGB values to YCbCr using AVX2 and FMA.
-        /// </summary>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static void ConvertInternal(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
         {
+            Debug.Assert(IsSupported, "AVX2 is required to run this converter");
+
+#if SUPPORTS_RUNTIME_INTRINSICS
             var f0299 = Vector256.Create(0.299f);
             var f0587 = Vector256.Create(0.587f);
             var f0114 = Vector256.Create(0.114f);
@@ -115,68 +61,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
             var fn0418688 = Vector256.Create(-0.418688f);
             var fn0081312F = Vector256.Create(-0.081312F);
             var f05 = Vector256.Create(0.5f);
+            var zero = Vector256.Create(0).AsByte();
 
             ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
-
-            for (int i = 0; i < 2; i++)
+            ref Vector256<float> destYRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock);
+            ref Vector256<float> destCbRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock);
+            ref Vector256<float> destCrRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock);
+
+            var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
+            var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
+            Vector256<byte> rgb, rg, bx;
+            Vector256<float> r, g, b;
+            for (int i = 0; i < 7; i++)
             {
-                ref Vector256<float> destYRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock), i * 4);
-                ref Vector256<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock), i * 4);
-                ref Vector256<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock), i * 4);
-
-                Vector256<byte> red = Unsafe.Add(ref inRef, i * 3);
-                Vector256<byte> green = Unsafe.Add(ref inRef, (i * 3) + 1);
-                Vector256<byte> blue = Unsafe.Add(ref inRef, (i * 3) + 2);
+                rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte();
 
-                for (int j = 0; j < 2; j++)
-                {
-                    // 1st part of unrolled loop
-                    Vector256<byte> mask = Unsafe.Add(ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractionMasks)), j * 2);
+                rgb = Avx2.Shuffle(rgb, extractRgbMask);
 
-                    Vector256<float> r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32());
-                    Vector256<float> g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32());
-                    Vector256<float> b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32());
+                rg = Avx2.UnpackLow(rgb, zero);
+                bx = Avx2.UnpackHigh(rgb, zero);
 
-                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
-                    Vector256<float> yy0 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b)));
+                r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
+                g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
+                b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
 
-                    // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
-                    Vector256<float> cb0 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b))));
+                // (0.299F * r) + (0.587F * g) + (0.114F * b);
+                Unsafe.Add(ref destYRef, i) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
 
-                    // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
-                    Vector256<float> cr0 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b))));
+                // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+                Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
 
-                    // 2nd part of unrolled loop
-                    mask = Unsafe.Add(ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractionMasks)), (j * 2) + 1);
-
-                    r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32());
-                    g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32());
-                    b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32());
+                // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+                Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
+            }
 
-                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
-                    Vector256<float> yy1 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b)));
+            extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
+            rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
+            rgb = Avx2.Shuffle(rgb, extractRgbMask);
 
-                    // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
-                    Vector256<float> cb1 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b))));
+            rg = Avx2.UnpackLow(rgb, zero);
+            bx = Avx2.UnpackHigh(rgb, zero);
 
-                    // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
-                    Vector256<float> cr1 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b))));
+            r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
+            g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
+            b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
 
-                    // store results from 1st and 2nd part
-                    Vector256<float> tmpY = Avx.Permute2x128(yy0, yy1, 0b0010_0001);
-                    Unsafe.Add(ref destYRef, j) = Avx.Blend(yy0, tmpY, 0b1111_0000);
-                    Unsafe.Add(ref destYRef, j + 2) = Avx.Blend(yy1, tmpY, 0b0000_1111);
+            // (0.299F * r) + (0.587F * g) + (0.114F * b);
+            Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
 
-                    Vector256<float> tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001);
-                    Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000);
-                    Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb1, tmpCb, 0b0000_1111);
+            // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+            Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
 
-                    Vector256<float> tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001);
-                    Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000);
-                    Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr1, tmpCr, 0b0000_1111);
-                }
-            }
-        }
+            // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+            Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
 #endif
+        }
     }
 }
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
index 776cbb44f3..9a6fc8d6fd 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@@ -48,17 +48,13 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 
             Rgb24[] data = CreateTestData();
 
-            // RgbToYCbCrConverterVectorized uses `data` as working memory so we need a copy for verification below
-            Rgb24[] dataCopy = new Rgb24[data.Length];
-            data.CopyTo(dataCopy, 0);
-
             Block8x8F y = default;
             Block8x8F cb = default;
             Block8x8F cr = default;
 
             RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr);
 
-            Verify(dataCopy, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
+            Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
         }
 
         private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer)
@@ -73,7 +69,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                 float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
                 float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
 
-                Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), comparer);
+                Assert.True(comparer.Equals(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y} == {yResult[i]}, {cb} == {cbResult[i]}, {cr} == {crResult[i]}");
             }
         }
 

From 1033297a37519b56729b7a5ba54259ba1fcb4de4 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 18:19:31 +0100
Subject: [PATCH 06/13] Add initial FMA resize kernel convolve implementation

---
 .../Transforms/Resize/ResizeKernel.cs         | 58 +++++++++++++++----
 1 file changed, 48 insertions(+), 10 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index d94aeffe69..bff2c574a6 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -4,6 +4,10 @@
 using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 namespace SixLabors.ImageSharp.Processing.Processors.Transforms
 {
@@ -66,21 +70,55 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
         [MethodImpl(InliningOptions.ShortMethod)]
         public Vector4 ConvolveCore(ref Vector4 rowStartRef)
         {
-            ref float horizontalValues = ref Unsafe.AsRef<float>(this.bufferPtr);
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Fma.IsSupported)
+            {
+                float* bufferStart = this.bufferPtr;
+                float* bufferEnd = bufferStart + (this.Length & ~1);
+                Vector256<float> result256 = Vector256<float>.Zero;
 
-            // Destination color components
-            Vector4 result = Vector4.Zero;
+                while (bufferStart < bufferEnd)
+                {
+                    Vector256<float> rowItem256 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
+                    var bufferItem256 = Vector256.Create(Vector128.Create(bufferStart[0]), Vector128.Create(bufferStart[1]));
 
-            for (int i = 0; i < this.Length; i++)
-            {
-                float weight = Unsafe.Add(ref horizontalValues, i);
+                    result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256);
+
+                    bufferStart += 2;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
+                }
+
+                Vector128<float> result128 = Sse.Add(result256.GetLower(), result256.GetUpper());
+
+                if ((this.Length & 1) != 0)
+                {
+                    Vector128<float> rowItem128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
+                    var bufferItem128 = Vector128.Create(*bufferStart);
 
-                // Vector4 v = offsetedRowSpan[i];
-                Vector4 v = Unsafe.Add(ref rowStartRef, i);
-                result += v * weight;
+                    result128 = Fma.MultiplyAdd(rowItem128, bufferItem128, result128);
+                }
+
+                return *(Vector4*)&result128;
             }
+            else
+#endif
+            {
+                // Destination color components
+                Vector4 result = Vector4.Zero;
+                float* bufferStart = this.bufferPtr;
+                float* bufferEnd = this.bufferPtr + this.Length;
+
+                while (bufferStart < bufferEnd)
+                {
+                    // Vector4 v = offsetedRowSpan[i];
+                    result += rowStartRef * *bufferStart;
 
-            return result;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
+                    bufferStart++;
+                }
+
+                return result;
+            }
         }
 
         /// <summary>

From c825eccd10f14eb733cdbe4c75656005afae5aed Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 19:14:51 +0100
Subject: [PATCH 07/13] Improved loading of factors using permutation

Assembly for loading in the loop went from:
```asm
vmovss xmm2, [rax]
vbroadcastss xmm2, xmm2
vmovss xmm3, [rax+4]
vbroadcastss xmm3, xmm3
vinsertf128 ymm2, ymm2, xmm3, 1
```
To:
```asm
vmovsd xmm3, [rax]
vbroadcastsd ymm3, xmm3
vpermps ymm3, ymm1, ymm3
```
---
 .../Processing/Processors/Transforms/Resize/ResizeKernel.cs    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index bff2c574a6..02027f42d8 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -76,11 +76,12 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
                 float* bufferStart = this.bufferPtr;
                 float* bufferEnd = bufferStart + (this.Length & ~1);
                 Vector256<float> result256 = Vector256<float>.Zero;
+                var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1);
 
                 while (bufferStart < bufferEnd)
                 {
                     Vector256<float> rowItem256 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
-                    var bufferItem256 = Vector256.Create(Vector128.Create(bufferStart[0]), Vector128.Create(bufferStart[1]));
+                    Vector256<float> bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask);
 
                     result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256);
 

From 1169e73915d98590e82d64f72fa3c2197e00aea9 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 19:33:15 +0100
Subject: [PATCH 08/13] Switch from FMA to AVX2 instructions

---
 .../Processors/Transforms/Resize/ResizeKernel.cs       | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index 02027f42d8..5a87d045ea 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
         public Vector4 ConvolveCore(ref Vector4 rowStartRef)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
-            if (Fma.IsSupported)
+            if (Avx2.IsSupported)
             {
                 float* bufferStart = this.bufferPtr;
                 float* bufferEnd = bufferStart + (this.Length & ~1);
@@ -82,8 +82,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
                 {
                     Vector256<float> rowItem256 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
                     Vector256<float> bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask);
+                    Vector256<float> multiply256 = Avx.Multiply(rowItem256, bufferItem256);
 
-                    result256 = Fma.MultiplyAdd(rowItem256, bufferItem256, result256);
+                    result256 = Avx.Add(multiply256, result256);
 
                     bufferStart += 2;
                     rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
@@ -95,8 +96,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
                 {
                     Vector128<float> rowItem128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
                     var bufferItem128 = Vector128.Create(*bufferStart);
+                    Vector128<float> multiply128 = Sse.Multiply(rowItem128, bufferItem128);
 
-                    result128 = Fma.MultiplyAdd(rowItem128, bufferItem128, result128);
+                    result128 = Sse.Add(multiply128, result128);
                 }
 
                 return *(Vector4*)&result128;
@@ -114,8 +116,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
                     // Vector4 v = offsetedRowSpan[i];
                     result += rowStartRef * *bufferStart;
 
-                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
                     bufferStart++;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
                 }
 
                 return result;

From 0e465cd8c30713b1c3c91966ebef855d4eda314d Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 22:58:43 +0100
Subject: [PATCH 09/13] Revert to FMA, codegen improvements

---
 .../Transforms/Resize/ResizeKernel.cs         | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index 5a87d045ea..bd22864bb2 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
         public Vector4 ConvolveCore(ref Vector4 rowStartRef)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx2.IsSupported)
+            if (Fma.IsSupported)
             {
                 float* bufferStart = this.bufferPtr;
                 float* bufferEnd = bufferStart + (this.Length & ~1);
@@ -80,11 +80,20 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
 
                 while (bufferStart < bufferEnd)
                 {
-                    Vector256<float> rowItem256 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
-                    Vector256<float> bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask);
-                    Vector256<float> multiply256 = Avx.Multiply(rowItem256, bufferItem256);
-
-                    result256 = Avx.Add(multiply256, result256);
+                    // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
+                    // for the FMA operation, and execute it directly on the target register and reading directly from
+                    // memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
+                    // The code below should compile to the following assembly on .NET 5 x64:
+                    //
+                    // vmovsd xmm2, [rax]               ; load *(double*)bufferStart into xmm2 as [ab, _]
+                    // vpermps ymm2, ymm1, ymm2         ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
+                    // vfmadd231ps ymm0, ymm2, [r8]     ; result256 = FMA(pixels, factors) + result256
+                    //
+                    // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
+                    result256 = Fma.MultiplyAdd(
+                        Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
+                        Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
+                        result256);
 
                     bufferStart += 2;
                     rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
@@ -94,11 +103,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
 
                 if ((this.Length & 1) != 0)
                 {
-                    Vector128<float> rowItem128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
-                    var bufferItem128 = Vector128.Create(*bufferStart);
-                    Vector128<float> multiply128 = Sse.Multiply(rowItem128, bufferItem128);
-
-                    result128 = Sse.Add(multiply128, result128);
+                    result128 = Fma.MultiplyAdd(
+                        Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
+                        Vector128.Create(*bufferStart),
+                        result128);
                 }
 
                 return *(Vector4*)&result128;

From e0b2defde22343414ee70babe21d1209fb760cbe Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 23:25:47 +0100
Subject: [PATCH 10/13] Add unrolled FMA loop

---
 .../Transforms/Resize/ResizeKernel.cs         | 34 ++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index bd22864bb2..b537cdfdf9 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -74,8 +74,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
             if (Fma.IsSupported)
             {
                 float* bufferStart = this.bufferPtr;
-                float* bufferEnd = bufferStart + (this.Length & ~1);
-                Vector256<float> result256 = Vector256<float>.Zero;
+                float* bufferEnd = bufferStart + (this.Length & ~3);
+                Vector256<float> result256_0 = Vector256<float>.Zero;
+                Vector256<float> result256_1 = Vector256<float>.Zero;
                 var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1);
 
                 while (bufferStart < bufferEnd)
@@ -87,19 +88,36 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
                     //
                     // vmovsd xmm2, [rax]               ; load *(double*)bufferStart into xmm2 as [ab, _]
                     // vpermps ymm2, ymm1, ymm2         ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
-                    // vfmadd231ps ymm0, ymm2, [r8]     ; result256 = FMA(pixels, factors) + result256
+                    // vfmadd231ps ymm0, ymm2, [r8]     ; result256_0 = FMA(pixels, factors) + result256_0
                     //
                     // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
-                    result256 = Fma.MultiplyAdd(
+                    // Additionally, we're also unrolling two computations per loop iteration to leverage the
+                    // fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
+                    result256_0 = Fma.MultiplyAdd(
                         Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
                         Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
-                        result256);
+                        result256_0);
 
-                    bufferStart += 2;
-                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
+                    result256_1 = Fma.MultiplyAdd(
+                        Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
+                        Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
+                        result256_1);
+
+                    bufferStart += 4;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
+                }
+
+                result256_0 = Avx.Add(result256_0, result256_1);
+
+                if ((this.Length & 3) >= 2)
+                {
+                    result256_0 = Fma.MultiplyAdd(
+                        Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
+                        Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
+                        result256_0);
                 }
 
-                Vector128<float> result128 = Sse.Add(result256.GetLower(), result256.GetUpper());
+                Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());
 
                 if ((this.Length & 1) != 0)
                 {

From e68a21de52d1de7c9eaeb234ab50ec4cf470c2fc Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Tue, 19 Jan 2021 23:31:32 +0100
Subject: [PATCH 11/13] Add missing indexing update

---
 .../Processing/Processors/Transforms/Resize/ResizeKernel.cs    | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index b537cdfdf9..c79f938d73 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -115,6 +115,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
                         Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
                         Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
                         result256_0);
+
+                    bufferStart += 2;
+                    rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
                 }
 
                 Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());

From ed4cfaa0ae4165357db4778da198189b8bc7d003 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Wed, 20 Jan 2021 17:39:12 +0100
Subject: [PATCH 12/13] Workaround for incorrect codegen on .NET 5

See Vector256.Create issue: https://github.com/dotnet/runtime/issues/47236
---
 .../Processors/Transforms/Resize/ResizeKernel.cs       | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
index c79f938d73..979206ad5c 100644
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -5,6 +5,7 @@ using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 #endif
@@ -77,7 +78,14 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
                 float* bufferEnd = bufferStart + (this.Length & ~3);
                 Vector256<float> result256_0 = Vector256<float>.Zero;
                 Vector256<float> result256_1 = Vector256<float>.Zero;
-                var mask = Vector256.Create(0, 0, 0, 0, 1, 1, 1, 1);
+                ReadOnlySpan<byte> maskBytes = new byte[]
+                {
+                    0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0,
+                    1, 0, 0, 0, 1, 0, 0, 0,
+                    1, 0, 0, 0, 1, 0, 0, 0,
+                };
+                Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));
 
                 while (bufferStart < bufferEnd)
                 {

From 8c7019e41e9a9dfbba63af19859194471a08be3a Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Wed, 20 Jan 2021 21:50:35 +0100
Subject: [PATCH 13/13] Update image threshold for resize tests

---
 .../Processing/Processors/Transforms/ResizeTests.cs             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs
index f4a94782fd..58b7fd12e8 100644
--- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs
@@ -139,7 +139,7 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Transforms
                         testOutputDetails: workingBufferLimitInRows,
                         appendPixelTypeToFileName: false);
                     image.CompareToReferenceOutput(
-                        ImageComparer.TolerantPercentage(0.001f),
+                        ImageComparer.TolerantPercentage(0.004f),
                         provider,
                         testOutputDetails: workingBufferLimitInRows,
                         appendPixelTypeToFileName: false);