Merge remote-tracking branch 'upstream/master' into bigtiff

5 years ago · ff72d69a48
93 changed files with 1839 additions and 848 deletions
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 33cb12ca77f919b44de56f344d2627cc2a108c3a
+Subproject commit a042aba176cdb840d800c6ed4cfe41a54fb7b1e3
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@ -820,6 +820,26 @@ namespace SixLabors.ImageSharp
            }
        }

+        /// <summary>
+        /// Horizontally adds all eight 32-bit elements of the vector into one sum.
+        /// </summary>
+        /// <param name="accumulator">The vector whose elements are summed.</param>
+        /// <returns>The sum of all elements.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int ReduceSum(Vector256<int> accumulator)
+        {
+            // Fold the upper 128-bit half onto the lower half: 8 elements become 4 partial sums.
+            Vector128<int> vsum = Sse2.Add(accumulator.GetLower(), accumulator.GetUpper());
+
+            // Add odd-indexed elements onto even-indexed ones: elements 0 and 2 now hold pair sums.
+            vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_11_01_01));
+
+            // Add the pair sum in element 2 onto element 0, which then holds the total that is returned.
+            vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10));
+
+            return Sse2.ConvertToInt32(vsum);
+        }
+
        /// <summary>
        /// Reduces even elements of the vector into one sum.
        /// </summary>
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@ -337,6 +337,64 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            }
        }

+        /// <summary>
+        /// Transposes the block in place by swapping each element above the main diagonal with its mirror below it.
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public void TransposeInplace()
+        {
+            ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);
+
+            // row #0: swap (0, c) with (c, 0) for c = 1..7
+            Swap(ref Unsafe.Add(ref elemRef, 1), ref Unsafe.Add(ref elemRef, 8));
+            Swap(ref Unsafe.Add(ref elemRef, 2), ref Unsafe.Add(ref elemRef, 16));
+            Swap(ref Unsafe.Add(ref elemRef, 3), ref Unsafe.Add(ref elemRef, 24));
+            Swap(ref Unsafe.Add(ref elemRef, 4), ref Unsafe.Add(ref elemRef, 32));
+            Swap(ref Unsafe.Add(ref elemRef, 5), ref Unsafe.Add(ref elemRef, 40));
+            Swap(ref Unsafe.Add(ref elemRef, 6), ref Unsafe.Add(ref elemRef, 48));
+            Swap(ref Unsafe.Add(ref elemRef, 7), ref Unsafe.Add(ref elemRef, 56));
+
+            // row #1: swap (1, c) with (c, 1) for c = 2..7
+            Swap(ref Unsafe.Add(ref elemRef, 10), ref Unsafe.Add(ref elemRef, 17));
+            Swap(ref Unsafe.Add(ref elemRef, 11), ref Unsafe.Add(ref elemRef, 25));
+            Swap(ref Unsafe.Add(ref elemRef, 12), ref Unsafe.Add(ref elemRef, 33));
+            Swap(ref Unsafe.Add(ref elemRef, 13), ref Unsafe.Add(ref elemRef, 41));
+            Swap(ref Unsafe.Add(ref elemRef, 14), ref Unsafe.Add(ref elemRef, 49));
+            Swap(ref Unsafe.Add(ref elemRef, 15), ref Unsafe.Add(ref elemRef, 57));
+
+            // row #2: swap (2, c) with (c, 2) for c = 3..7
+            Swap(ref Unsafe.Add(ref elemRef, 19), ref Unsafe.Add(ref elemRef, 26));
+            Swap(ref Unsafe.Add(ref elemRef, 20), ref Unsafe.Add(ref elemRef, 34));
+            Swap(ref Unsafe.Add(ref elemRef, 21), ref Unsafe.Add(ref elemRef, 42));
+            Swap(ref Unsafe.Add(ref elemRef, 22), ref Unsafe.Add(ref elemRef, 50));
+            Swap(ref Unsafe.Add(ref elemRef, 23), ref Unsafe.Add(ref elemRef, 58));
+
+            // row #3: swap (3, c) with (c, 3) for c = 4..7
+            Swap(ref Unsafe.Add(ref elemRef, 28), ref Unsafe.Add(ref elemRef, 35));
+            Swap(ref Unsafe.Add(ref elemRef, 29), ref Unsafe.Add(ref elemRef, 43));
+            Swap(ref Unsafe.Add(ref elemRef, 30), ref Unsafe.Add(ref elemRef, 51));
+            Swap(ref Unsafe.Add(ref elemRef, 31), ref Unsafe.Add(ref elemRef, 59));
+
+            // row #4: swap (4, c) with (c, 4) for c = 5..7
+            Swap(ref Unsafe.Add(ref elemRef, 37), ref Unsafe.Add(ref elemRef, 44));
+            Swap(ref Unsafe.Add(ref elemRef, 38), ref Unsafe.Add(ref elemRef, 52));
+            Swap(ref Unsafe.Add(ref elemRef, 39), ref Unsafe.Add(ref elemRef, 60));
+
+            // row #5: swap (5, c) with (c, 5) for c = 6..7
+            Swap(ref Unsafe.Add(ref elemRef, 46), ref Unsafe.Add(ref elemRef, 53));
+            Swap(ref Unsafe.Add(ref elemRef, 47), ref Unsafe.Add(ref elemRef, 61));
+
+            // row #6: swap (6, 7) with (7, 6); row #7 has no elements above the diagonal
+            Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62));
+
+            static void Swap(ref short a, ref short b)
+            {
+                short tmp = a;
+                a = b;
+                b = tmp;
+            }
+        }
+
        /// <summary>
        /// Calculate the total sum of absolute differences of elements in 'a' and 'b'.
        /// </summary>
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
@ -151,6 +151,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            if (this.componentsCount == this.frame.ComponentCount)
            {
                this.ParseBaselineDataInterleaved();
+                this.spectralConverter.CommitConversion();
            }
            else
            {
@ -501,7 +502,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                {
                    i += r;
                    s = buffer.Receive(s);
-                    Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s;
+                    Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[i++]) = (short)s;
                }
                else
                {
@ -570,7 +571,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                    if (s != 0)
                    {
                        s = buffer.Receive(s);
-                        Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low);
+                        Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[i]) = (short)(s << low);
                    }
                    else
                    {
@ -646,7 +647,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder

                    do
                    {
-                        ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
+                        ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]);
                        if (coef != 0)
                        {
                            buffer.CheckBits();
@ -672,7 +673,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder

                    if ((s != 0) && (k < 64))
                    {
-                        Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s;
+                        Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]) = (short)s;
                    }
                }
            }
@ -681,7 +682,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            {
                for (; k <= end; k++)
                {
-                    ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
+                    ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]);

                    if (coef != 0)
                    {
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@ -18,11 +18,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        /// </summary>
        public Block8x8F SourceBlock;

-        /// <summary>
-        /// Temporal block to store intermediate computation results.
-        /// </summary>
-        public Block8x8F WorkspaceBlock;
-
        /// <summary>
        /// The quantization table as <see cref="Block8x8F"/>.
        /// </summary>
@ -45,7 +40,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            this.subSamplingDivisors = component.SubSamplingDivisors;

            this.SourceBlock = default;
-            this.WorkspaceBlock = default;
        }

        /// <summary>
@ -71,7 +65,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            // Dequantize:
            block.MultiplyInPlace(ref this.DequantiazationTable);

-            FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock);
+            FastFloatingPointDCT.TransformIDCT(ref block);

            // To conform better to libjpeg we actually NEED TO lose precision here.
            // This is because they store blocks as Int16 between all the operations.
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
@ -13,6 +13,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
    /// </remarks>
    internal abstract class SpectralConverter
    {
+        /// <summary>
+        /// Gets a value indicating whether the spectral data of the current image
+        /// has already been converted; set to true by <see cref="CommitConversion"/>.
+        /// </summary>
+        protected bool Converted { get; private set; }
+
        /// <summary>
        /// Injects jpeg image decoding metadata.
        /// </summary>
@ -33,6 +39,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        /// </remarks>
        public abstract void ConvertStrideBaseline();

+        /// <summary>
+        /// Marks the current converter state as 'converted'.
+        /// </summary>
+        /// <remarks>
+        /// Call this only for baseline interleaved JPEGs, and only once: a repeated call fails the DebugGuard check.
+        /// </remarks>
+        public void CommitConversion()
+        {
+            DebugGuard.IsFalse(this.Converted, nameof(this.Converted), $"{nameof(this.CommitConversion)} must be called only once");
+
+            this.Converted = true;
+        }
+
        /// <summary>
        /// Gets the color converter.
        /// </summary>
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs
@ -3,6 +3,7 @@

 using System;
 using System.Buffers;
+using System.Linq;
 using System.Numerics;
 using System.Threading;
 using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters;
@ -29,8 +30,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder

        private Buffer2D<TPixel> pixelBuffer;

-        private int blockRowsPerStep;
-
        private int pixelRowsPerStep;

        private int pixelRowCounter;
@ -41,8 +40,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            this.cancellationToken = cancellationToken;
        }

-        private bool Converted => this.pixelRowCounter >= this.pixelBuffer.Height;
-
        public Buffer2D<TPixel> GetPixelBuffer()
        {
            if (!this.Converted)
@ -52,7 +49,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                for (int step = 0; step < steps; step++)
                {
                    this.cancellationToken.ThrowIfCancellationRequested();
-                    this.ConvertNextStride(step);
+                    this.ConvertStride(step);
                }
            }

@ -65,18 +62,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            MemoryAllocator allocator = this.configuration.MemoryAllocator;

            // iteration data
-            IJpegComponent c0 = frame.Components[0];
+            int majorBlockWidth = frame.Components.Max((component) => component.SizeInBlocks.Width);
+            int majorVerticalSamplingFactor = frame.Components.Max((component) => component.SamplingFactors.Height);

            const int blockPixelHeight = 8;
-            this.blockRowsPerStep = c0.SamplingFactors.Height;
-            this.pixelRowsPerStep = this.blockRowsPerStep * blockPixelHeight;
+            this.pixelRowsPerStep = majorVerticalSamplingFactor * blockPixelHeight;

            // pixel buffer for resulting image
            this.pixelBuffer = allocator.Allocate2D<TPixel>(frame.PixelWidth, frame.PixelHeight);
            this.paddedProxyPixelRow = allocator.Allocate<TPixel>(frame.PixelWidth + 3);

            // component processors from spectral to Rgba32
-            var postProcessorBufferSize = new Size(c0.SizeInBlocks.Width * 8, this.pixelRowsPerStep);
+            const int blockPixelWidth = 8;
+            var postProcessorBufferSize = new Size(majorBlockWidth * blockPixelWidth, this.pixelRowsPerStep);
            this.componentProcessors = new JpegComponentPostProcessor[frame.Components.Length];
            for (int i = 0; i < this.componentProcessors.Length; i++)
            {
@ -84,7 +82,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            }

            // single 'stride' rgba32 buffer for conversion between spectral and TPixel
-            // this.rgbaBuffer = allocator.Allocate<Vector4>(frame.PixelWidth);
            this.rgbBuffer = allocator.Allocate<byte>(frame.PixelWidth * 3);

            // color converter from Rgba32 to TPixel
@ -95,18 +92,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        public override void ConvertStrideBaseline()
        {
            // Convert next pixel stride using single spectral `stride'
-            // Note that zero passing eliminates the need of virtual call from JpegComponentPostProcessor
-            this.ConvertNextStride(spectralStep: 0);
+            // Note that zero passing eliminates the need of virtual call
+            // from JpegComponentPostProcessor
+            this.ConvertStride(spectralStep: 0);

-            // Clear spectral stride - this is VERY important as jpeg possibly won't fill entire buffer each stride
-            // Which leads to decoding artifacts
-            // Note that this code clears all buffers of the post processors, it's their responsibility to allocate only single stride
            foreach (JpegComponentPostProcessor cpp in this.componentProcessors)
            {
                cpp.ClearSpectralBuffers();
            }
        }

+        /// <inheritdoc/>
        public void Dispose()
        {
            if (this.componentProcessors != null)
@ -121,7 +117,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            this.paddedProxyPixelRow?.Dispose();
        }

-        private void ConvertNextStride(int spectralStep)
+        private void ConvertStride(int spectralStep)
        {
            int maxY = Math.Min(this.pixelBuffer.Height, this.pixelRowCounter + this.pixelRowsPerStep);

--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@ -2,9 +2,6 @@
 // Licensed under the Apache License, Version 2.0.

 #if SUPPORTS_RUNTIME_INTRINSICS
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;

@ -12,149 +9,147 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
    internal static partial class FastFloatingPointDCT
    {
-#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rule violation warnings
        private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
        private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
        private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
        private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);

-        private static readonly Vector256<float> mm256_F_1_1758 = Vector256.Create(1.175876f);
-        private static readonly Vector256<float> mm256_F_n1_9615 = Vector256.Create(-1.961570560f);
-        private static readonly Vector256<float> mm256_F_n0_3901 = Vector256.Create(-0.390180644f);
-        private static readonly Vector256<float> mm256_F_n0_8999 = Vector256.Create(-0.899976223f);
-        private static readonly Vector256<float> mm256_F_n2_5629 = Vector256.Create(-2.562915447f);
-        private static readonly Vector256<float> mm256_F_0_2986 = Vector256.Create(0.298631336f);
-        private static readonly Vector256<float> mm256_F_2_0531 = Vector256.Create(2.053119869f);
-        private static readonly Vector256<float> mm256_F_3_0727 = Vector256.Create(3.072711026f);
-        private static readonly Vector256<float> mm256_F_1_5013 = Vector256.Create(1.501321110f);
-        private static readonly Vector256<float> mm256_F_n1_8477 = Vector256.Create(-1.847759065f);
-        private static readonly Vector256<float> mm256_F_0_7653 = Vector256.Create(0.765366865f);
+        private static readonly Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
+        private static readonly Vector256<float> mm256_F_1_8477 = Vector256.Create(1.847759065f);
+        private static readonly Vector256<float> mm256_F_n1_0823 = Vector256.Create(-1.082392200f);
+        private static readonly Vector256<float> mm256_F_n2_6131 = Vector256.Create(-2.613125930f);
 #pragma warning restore SA1310, SA1311, IDE1006

        /// <summary>
        /// Apply floating point FDCT inplace using simd operations.
        /// </summary>
-        /// <param name="block">Input matrix.</param>
-        private static void ForwardTransform_Avx(ref Block8x8F block)
+        /// <param name="block">Input block.</param>
+        private static void FDCT8x8_Avx(ref Block8x8F block)
        {
            DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");

            // First pass - process rows
            block.TransposeInplace();
-            FDCT8x8_Avx(ref block);
+            FDCT8x8_1D_Avx(ref block);

            // Second pass - process columns
            block.TransposeInplace();
-            FDCT8x8_Avx(ref block);
+            FDCT8x8_1D_Avx(ref block);
+
+            // Applies 1D floating point FDCT inplace
+            static void FDCT8x8_1D_Avx(ref Block8x8F block)
+            {
+                Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
+                Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
+                Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
+                Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
+                Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
+                Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
+                Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
+                Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);
+
+                // Even part
+                Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
+                Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
+                Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
+                Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
+
+                block.V0 = Avx.Add(tmp10, tmp11);
+                block.V4 = Avx.Subtract(tmp10, tmp11);
+
+                Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
+                block.V2 = Avx.Add(tmp13, z1);
+                block.V6 = Avx.Subtract(tmp13, z1);
+
+                // Odd part
+                tmp10 = Avx.Add(tmp4, tmp5);
+                tmp11 = Avx.Add(tmp5, tmp6);
+                tmp12 = Avx.Add(tmp6, tmp7);
+
+                Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
+                Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
+                Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
+                Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
+
+                Vector256<float> z11 = Avx.Add(tmp7, z3);
+                Vector256<float> z13 = Avx.Subtract(tmp7, z3);
+
+                block.V5 = Avx.Add(z13, z2);
+                block.V3 = Avx.Subtract(z13, z2);
+                block.V1 = Avx.Add(z11, z4);
+                block.V7 = Avx.Subtract(z11, z4);
+            }
        }

        /// <summary>
-        /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix.
+        /// Apply floating point IDCT inplace using simd operations.
        /// </summary>
-        /// <remarks>
-        /// Requires Avx support.
-        /// </remarks>
-        /// <param name="block">Input matrix.</param>
-        public static void FDCT8x8_Avx(ref Block8x8F block)
+        /// <param name="transposedBlock">Transposed input block.</param>
+        private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
        {
            DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");

-            Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
-            Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
-            Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
-            Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
-            Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
-            Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
-            Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
-            Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);
-
-            // Even part
-            Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
-            Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
-            Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
-            Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
-
-            block.V0 = Avx.Add(tmp10, tmp11);
-            block.V4 = Avx.Subtract(tmp10, tmp11);
-
-            Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
-            block.V2 = Avx.Add(tmp13, z1);
-            block.V6 = Avx.Subtract(tmp13, z1);
-
-            // Odd part
-            tmp10 = Avx.Add(tmp4, tmp5);
-            tmp11 = Avx.Add(tmp5, tmp6);
-            tmp12 = Avx.Add(tmp6, tmp7);
-
-            Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
-            Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
-            Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
-            Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
-
-            Vector256<float> z11 = Avx.Add(tmp7, z3);
-            Vector256<float> z13 = Avx.Subtract(tmp7, z3);
-
-            block.V5 = Avx.Add(z13, z2);
-            block.V3 = Avx.Subtract(z13, z2);
-            block.V1 = Avx.Add(z11, z4);
-            block.V7 = Avx.Subtract(z11, z4);
-        }
-
-        /// <summary>
-        /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
-        /// using AVX commands.
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
-        {
-            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
-            Vector256<float> my1 = s.V1;
-            Vector256<float> my7 = s.V7;
-            Vector256<float> mz0 = Avx.Add(my1, my7);
-
-            Vector256<float> my3 = s.V3;
-            Vector256<float> mz2 = Avx.Add(my3, my7);
-            Vector256<float> my5 = s.V5;
-            Vector256<float> mz1 = Avx.Add(my3, my5);
-            Vector256<float> mz3 = Avx.Add(my1, my5);
-
-            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758);
-
-            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615);
-            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901);
-            mz0 = Avx.Multiply(mz0, mm256_F_n0_8999);
-            mz1 = Avx.Multiply(mz1, mm256_F_n2_5629);
-
-            Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2);
-            Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3);
-            Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2);
-            Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3);
-
-            Vector256<float> my2 = s.V2;
-            Vector256<float> my6 = s.V6;
-            mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411);
-            Vector256<float> my0 = s.V0;
-            Vector256<float> my4 = s.V4;
-            mz0 = Avx.Add(my0, my4);
-            mz1 = Avx.Subtract(my0, my4);
-            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477);
-            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653);
-
-            my0 = Avx.Add(mz0, mz3);
-            my3 = Avx.Subtract(mz0, mz3);
-            my1 = Avx.Add(mz1, mz2);
-            my2 = Avx.Subtract(mz1, mz2);
-
-            d.V0 = Avx.Add(my0, mb0);
-            d.V7 = Avx.Subtract(my0, mb0);
-            d.V1 = Avx.Add(my1, mb1);
-            d.V6 = Avx.Subtract(my1, mb1);
-            d.V2 = Avx.Add(my2, mb2);
-            d.V5 = Avx.Subtract(my2, mb2);
-            d.V3 = Avx.Add(my3, mb3);
-            d.V4 = Avx.Subtract(my3, mb3);
+            // First pass - process columns
+            IDCT8x8_1D_Avx(ref transposedBlock);
+
+            // Second pass - process rows
+            transposedBlock.TransposeInplace();
+            IDCT8x8_1D_Avx(ref transposedBlock);
+
+            // Applies 1D floating point IDCT inplace
+            static void IDCT8x8_1D_Avx(ref Block8x8F block)
+            {
+                // Even part
+                Vector256<float> tmp0 = block.V0;
+                Vector256<float> tmp1 = block.V2;
+                Vector256<float> tmp2 = block.V4;
+                Vector256<float> tmp3 = block.V6;
+
+                Vector256<float> z5 = tmp0;
+                Vector256<float> tmp10 = Avx.Add(z5, tmp2);
+                Vector256<float> tmp11 = Avx.Subtract(z5, tmp2);
+
+                Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
+                Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
+
+                tmp0 = Avx.Add(tmp10, tmp13);
+                tmp3 = Avx.Subtract(tmp10, tmp13);
+                tmp1 = Avx.Add(tmp11, tmp12);
+                tmp2 = Avx.Subtract(tmp11, tmp12);
+
+                // Odd part
+                Vector256<float> tmp4 = block.V1;
+                Vector256<float> tmp5 = block.V3;
+                Vector256<float> tmp6 = block.V5;
+                Vector256<float> tmp7 = block.V7;
+
+                Vector256<float> z13 = Avx.Add(tmp6, tmp5);
+                Vector256<float> z10 = Avx.Subtract(tmp6, tmp5);
+                Vector256<float> z11 = Avx.Add(tmp4, tmp7);
+                Vector256<float> z12 = Avx.Subtract(tmp4, tmp7);
+
+                tmp7 = Avx.Add(z11, z13);
+                tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);
+
+                z5 = Avx.Multiply(Avx.Add(z10, z12), mm256_F_1_8477);
+
+                tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, mm256_F_n1_0823);
+                tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, mm256_F_n2_6131);
+
+                tmp6 = Avx.Subtract(tmp12, tmp7);
+                tmp5 = Avx.Subtract(tmp11, tmp6);
+                tmp4 = Avx.Subtract(tmp10, tmp5);
+
+                block.V0 = Avx.Add(tmp0, tmp7);
+                block.V7 = Avx.Subtract(tmp0, tmp7);
+                block.V1 = Avx.Add(tmp1, tmp6);
+                block.V6 = Avx.Subtract(tmp1, tmp6);
+                block.V2 = Avx.Add(tmp2, tmp5);
+                block.V5 = Avx.Subtract(tmp2, tmp5);
+                block.V3 = Avx.Add(tmp3, tmp4);
+                block.V4 = Avx.Subtract(tmp3, tmp4);
+            }
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@ -3,6 +3,7 @@

 using System.Numerics;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
 using System.Runtime.Intrinsics.X86;
 #endif
@ -15,102 +16,202 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
    /// </summary>
    internal static partial class FastFloatingPointDCT
    {
-#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
-        private const float C_1_175876 = 1.175875602f;
-        private const float C_1_961571 = -1.961570560f;
-        private const float C_0_390181 = -0.390180644f;
-        private const float C_0_899976 = -0.899976223f;
-        private const float C_2_562915 = -2.562915447f;
-        private const float C_0_298631 = 0.298631336f;
-        private const float C_2_053120 = 2.053119869f;
-        private const float C_3_072711 = 3.072711026f;
-        private const float C_1_501321 = 1.501321110f;
-        private const float C_0_541196 = 0.541196100f;
-        private const float C_1_847759 = -1.847759065f;
-        private const float C_0_765367 = 0.765366865f;
-
-        private const float C_0_125 = 0.1250f;
-
-#pragma warning disable SA1311, IDE1006 // naming rules violation warnings
-        private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f);
-        private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f);
-        private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f);
-        private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f);
-#pragma warning restore SA1311, IDE1006
-
-#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+        private static readonly Vector4 mm128_F_0_7071 = new(0.707106781f);
+        private static readonly Vector4 mm128_F_0_3826 = new(0.382683433f);
+        private static readonly Vector4 mm128_F_0_5411 = new(0.541196100f);
+        private static readonly Vector4 mm128_F_1_3065 = new(1.306562965f);
+
+        private static readonly Vector4 mm128_F_1_4142 = new(1.414213562f);
+        private static readonly Vector4 mm128_F_1_8477 = new(1.847759065f);
+        private static readonly Vector4 mm128_F_n1_0823 = new(-1.082392200f);
+        private static readonly Vector4 mm128_F_n2_6131 = new(-2.613125930f);
+#pragma warning restore SA1310, SA1311, IDE1006

        /// <summary>
-        /// Gets reciprocal coefficients for jpeg quantization tables calculation.
+        /// Gets adjustment table for quantization tables.
        /// </summary>
        /// <remarks>
        /// <para>
-        /// Current FDCT implementation expects its results to be multiplied by
-        /// a reciprocal quantization table. To get 8x8 reciprocal block values in this
-        /// table must be divided by quantization table values scaled with quality settings.
+        /// Current IDCT and FDCT implementations are based on Arai, Agui,
+        /// and Nakajima's algorithm. Neither DCT method produces
+        /// finished DCT output; the final step is fused into the
+        /// quantization step. Quantization and de-quantization coefficients
+        /// must be multiplied by these values.
        /// </para>
        /// <para>
-        /// These values were calculates with this formula:
-        /// <code>
-        /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
-        /// </code>
-        /// Where:
+        /// The values were generated by the formula:
        /// <code>
+        /// scalefactor[row] * scalefactor[col], where
        /// scalefactor[0] = 1
-        /// </code>
-        /// <code>
        /// scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
        /// </code>
-        /// Values are also scaled by 8 so DCT code won't do extra division/multiplication.
        /// </para>
        /// </remarks>
-        internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[]
+        private static readonly float[] AdjustmentCoefficients = new float[]
        {
-            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
-            0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
-            0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
-            0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
-            0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
-            0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
-            0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
-            0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
+            1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f,
+            1.3870399f, 1.9238797f, 1.812255f, 1.6309863f, 1.3870399f, 1.0897902f, 0.7506606f, 0.38268346f,
+            1.306563f, 1.812255f, 1.707107f, 1.5363555f, 1.306563f, 1.02656f, 0.7071068f, 0.36047992f,
+            1.1758755f, 1.6309863f, 1.5363555f, 1.3826833f, 1.1758755f, 0.9238795f, 0.63637924f, 0.32442334f,
+            1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f,
+            0.78569496f, 1.0897902f, 1.02656f, 0.9238795f, 0.78569496f, 0.61731654f, 0.42521507f, 0.21677275f,
+            0.5411961f, 0.7506606f, 0.7071068f, 0.63637924f, 0.5411961f, 0.42521507f, 0.29289323f, 0.14931567f,
+            0.27589938f, 0.38268346f, 0.36047992f, 0.32442334f, 0.27589938f, 0.21677275f, 0.14931567f, 0.076120466f,
        };

        /// <summary>
-        /// Adjusts given quantization table to be complient with FDCT implementation.
+        /// Adjusts given quantization table for usage with <see cref="TransformIDCT"/>.
+        /// </summary>
+        /// <param name="quantTable">Quantization table to adjust.</param>
+        public static void AdjustToIDCT(ref Block8x8F quantTable)
+        {
+            ref float tableRef = ref Unsafe.As<Block8x8F, float>(ref quantTable); // reinterpret the block as a ref to its first float
+            ref float multipliersRef = ref MemoryMarshal.GetReference<float>(AdjustmentCoefficients);
+            for (nint i = 0; i < Block8x8F.Size; i++)
+            {
+                tableRef = 0.125f * tableRef * Unsafe.Add(ref multipliersRef, i); // scale the current entry by 0.125 and by AdjustmentCoefficients[i]
+                tableRef = ref Unsafe.Add(ref tableRef, 1); // re-point the ref at the next table entry
+            }
+
+            // Spectral macroblocks are transposed before quantization,
+            // so the adjusted quantization table must be transposed as well.
+            quantTable.TransposeInplace();
+        }
+
+        /// <summary>
+        /// Adjusts given quantization table for usage with <see cref="TransformFDCT"/>.
+        /// </summary>
+        /// <param name="quantTable">Quantization table to adjust.</param>
+        public static void AdjustToFDCT(ref Block8x8F quantTable)
+        {
+            ref float tableRef = ref Unsafe.As<Block8x8F, float>(ref quantTable); // reinterpret the block as a ref to its first float
+            ref float multipliersRef = ref MemoryMarshal.GetReference<float>(AdjustmentCoefficients);
+            for (nint i = 0; i < Block8x8F.Size; i++)
+            {
+                tableRef = 0.125f / (tableRef * Unsafe.Add(ref multipliersRef, i)); // store 0.125 divided by (entry * AdjustmentCoefficients[i]); presumably so quantization can multiply instead of divide - confirm against caller
+                tableRef = ref Unsafe.Add(ref tableRef, 1); // re-point the ref at the next table entry
+            } // note: unlike AdjustToIDCT, the table is not transposed here
+        }
+
+        /// <summary>
+        /// Apply 2D floating point IDCT inplace.
        /// </summary>
        /// <remarks>
-        /// See <see cref="DctReciprocalAdjustmentCoefficients"/> docs for explanation.
+        /// Input block must be dequantized before this method with table
+        /// adjusted by <see cref="AdjustToIDCT"/>.
        /// </remarks>
-        /// <param name="quantizationtable">Quantization table to adjust.</param>
-        public static void AdjustToFDCT(ref Block8x8F quantizationtable)
+        /// <param name="block">Input block.</param>
+        public static void TransformIDCT(ref Block8x8F block)
        {
-            for (int i = 0; i < Block8x8F.Size; i++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
            {
-                quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i];
+                IDCT8x8_Avx(ref block);
+            }
+            else
+#endif
+            {
+                IDCT_Vector4(ref block);
            }
        }

        /// <summary>
-        /// Apply 2D floating point FDCT inplace.
+        /// Apply 2D floating point FDCT inplace.
        /// </summary>
-        /// <param name="block">Input matrix.</param>
+        /// <remarks>
+        /// Input block must be quantized after this method with table adjusted
+        /// by <see cref="AdjustToFDCT"/>.
+        /// </remarks>
+        /// <param name="block">Input block.</param>
        public static void TransformFDCT(ref Block8x8F block)
        {
 #if SUPPORTS_RUNTIME_INTRINSICS
            if (Avx.IsSupported)
            {
-                ForwardTransform_Avx(ref block);
+                FDCT8x8_Avx(ref block);
            }
            else
 #endif
            if (Vector.IsHardwareAccelerated)
            {
-                ForwardTransform_Vector4(ref block);
+                FDCT_Vector4(ref block);
            }
            else
            {
-                ForwardTransform_Scalar(ref block);
+                FDCT_Scalar(ref block);
+            }
+        }
+
+        /// <summary>
+        /// Apply floating point IDCT inplace using <see cref="Vector4"/> API.
+        /// </summary>
+        /// <param name="transposedBlock">Input block.</param>
+        private static void IDCT_Vector4(ref Block8x8F transposedBlock)
+        {
+            DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); // NOTE(review): TransformIDCT calls this whenever AVX is unavailable, without checking Vector.IsHardwareAccelerated - confirm this guard cannot fail there
+
+            // First pass - process columns (left 8x4 half, then right 8x4 half)
+            IDCT8x4_Vector4(ref transposedBlock.V0L);
+            IDCT8x4_Vector4(ref transposedBlock.V0R);
+
+            // Second pass - process rows by transposing and reusing the same 1D kernel
+            transposedBlock.TransposeInplace();
+            IDCT8x4_Vector4(ref transposedBlock.V0L);
+            IDCT8x4_Vector4(ref transposedBlock.V0R);
+
+            // Applies 1D floating point IDCT inplace on 8x4 part of 8x8 block; vecRef is the first Vector4 of that part and entry k is read at offset k * 2 (presumably because each row is stored as a left/right Vector4 pair - confirm against Block8x8F layout)
+            static void IDCT8x4_Vector4(ref Vector4 vecRef)
+            {
+                // Even part (entries 0, 2, 4 and 6)
+                Vector4 tmp0 = Unsafe.Add(ref vecRef, 0 * 2);
+                Vector4 tmp1 = Unsafe.Add(ref vecRef, 2 * 2);
+                Vector4 tmp2 = Unsafe.Add(ref vecRef, 4 * 2);
+                Vector4 tmp3 = Unsafe.Add(ref vecRef, 6 * 2);
+
+                Vector4 z5 = tmp0;
+                Vector4 tmp10 = z5 + tmp2;
+                Vector4 tmp11 = z5 - tmp2;
+
+                Vector4 tmp13 = tmp1 + tmp3;
+                Vector4 tmp12 = ((tmp1 - tmp3) * mm128_F_1_4142) - tmp13;
+
+                tmp0 = tmp10 + tmp13;
+                tmp3 = tmp10 - tmp13;
+                tmp1 = tmp11 + tmp12;
+                tmp2 = tmp11 - tmp12;
+
+                // Odd part (entries 1, 3, 5 and 7)
+                Vector4 tmp4 = Unsafe.Add(ref vecRef, 1 * 2);
+                Vector4 tmp5 = Unsafe.Add(ref vecRef, 3 * 2);
+                Vector4 tmp6 = Unsafe.Add(ref vecRef, 5 * 2);
+                Vector4 tmp7 = Unsafe.Add(ref vecRef, 7 * 2);
+
+                Vector4 z13 = tmp6 + tmp5;
+                Vector4 z10 = tmp6 - tmp5;
+                Vector4 z11 = tmp4 + tmp7;
+                Vector4 z12 = tmp4 - tmp7;
+
+                tmp7 = z11 + z13;
+                tmp11 = (z11 - z13) * mm128_F_1_4142;
+
+                z5 = (z10 + z12) * mm128_F_1_8477;
+
+                tmp10 = (z12 * mm128_F_n1_0823) + z5;
+                tmp12 = (z10 * mm128_F_n2_6131) + z5;
+
+                tmp6 = tmp12 - tmp7;
+                tmp5 = tmp11 - tmp6;
+                tmp4 = tmp10 - tmp5;
+
+                Unsafe.Add(ref vecRef, 0 * 2) = tmp0 + tmp7; // results overwrite the input entries in place, paired as (0,7), (1,6), (2,5), (3,4)
+                Unsafe.Add(ref vecRef, 7 * 2) = tmp0 - tmp7;
+                Unsafe.Add(ref vecRef, 1 * 2) = tmp1 + tmp6;
+                Unsafe.Add(ref vecRef, 6 * 2) = tmp1 - tmp6;
+                Unsafe.Add(ref vecRef, 2 * 2) = tmp2 + tmp5;
+                Unsafe.Add(ref vecRef, 5 * 2) = tmp2 - tmp5;
+                Unsafe.Add(ref vecRef, 3 * 2) = tmp3 + tmp4;
+                Unsafe.Add(ref vecRef, 4 * 2) = tmp3 - tmp4;
            }
        }

@ -120,8 +221,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        /// <remarks>
        /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
        /// </remarks>
-        /// <param name="block">Input matrix.</param>
-        private static void ForwardTransform_Scalar(ref Block8x8F block)
+        /// <param name="block">Input block.</param>
+        private static void FDCT_Scalar(ref Block8x8F block)
        {
            const int dctSize = 8;

@ -130,17 +231,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            float z1, z2, z3, z4, z5, z11, z13;

            // First pass - process rows
-            ref float dataRef = ref Unsafe.As<Block8x8F, float>(ref block);
+            ref float blockRef = ref Unsafe.As<Block8x8F, float>(ref block);
            for (int ctr = 7; ctr >= 0; ctr--)
            {
-                tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7);
-                tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7);
-                tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6);
-                tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6);
-                tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5);
-                tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5);
-                tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4);
-                tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4);
+                tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 7);
+                tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 7);
+                tmp1 = Unsafe.Add(ref blockRef, 1) + Unsafe.Add(ref blockRef, 6);
+                tmp6 = Unsafe.Add(ref blockRef, 1) - Unsafe.Add(ref blockRef, 6);
+                tmp2 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 5);
+                tmp5 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 5);
+                tmp3 = Unsafe.Add(ref blockRef, 3) + Unsafe.Add(ref blockRef, 4);
+                tmp4 = Unsafe.Add(ref blockRef, 3) - Unsafe.Add(ref blockRef, 4);

                // Even part
                tmp10 = tmp0 + tmp3;
@ -148,12 +249,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                tmp11 = tmp1 + tmp2;
                tmp12 = tmp1 - tmp2;

-                Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11;
-                Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11;
+                Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
+                Unsafe.Add(ref blockRef, 4) = tmp10 - tmp11;

                z1 = (tmp12 + tmp13) * 0.707106781f;
-                Unsafe.Add(ref dataRef, 2) = tmp13 + z1;
-                Unsafe.Add(ref dataRef, 6) = tmp13 - z1;
+                Unsafe.Add(ref blockRef, 2) = tmp13 + z1;
+                Unsafe.Add(ref blockRef, 6) = tmp13 - z1;

                // Odd part
                tmp10 = tmp4 + tmp5;
@ -168,26 +269,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                z11 = tmp7 + z3;
                z13 = tmp7 - z3;

-                Unsafe.Add(ref dataRef, 5) = z13 + z2;
-                Unsafe.Add(ref dataRef, 3) = z13 - z2;
-                Unsafe.Add(ref dataRef, 1) = z11 + z4;
-                Unsafe.Add(ref dataRef, 7) = z11 - z4;
+                Unsafe.Add(ref blockRef, 5) = z13 + z2;
+                Unsafe.Add(ref blockRef, 3) = z13 - z2;
+                Unsafe.Add(ref blockRef, 1) = z11 + z4;
+                Unsafe.Add(ref blockRef, 7) = z11 - z4;

-                dataRef = ref Unsafe.Add(ref dataRef, dctSize);
+                blockRef = ref Unsafe.Add(ref blockRef, dctSize);
            }

            // Second pass - process columns
-            dataRef = ref Unsafe.As<Block8x8F, float>(ref block);
+            blockRef = ref Unsafe.As<Block8x8F, float>(ref block);
            for (int ctr = 7; ctr >= 0; ctr--)
            {
-                tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7);
-                tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7);
-                tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6);
-                tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6);
-                tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5);
-                tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5);
-                tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4);
-                tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4);
+                tmp0 = Unsafe.Add(ref blockRef, dctSize * 0) + Unsafe.Add(ref blockRef, dctSize * 7);
+                tmp7 = Unsafe.Add(ref blockRef, dctSize * 0) - Unsafe.Add(ref blockRef, dctSize * 7);
+                tmp1 = Unsafe.Add(ref blockRef, dctSize * 1) + Unsafe.Add(ref blockRef, dctSize * 6);
+                tmp6 = Unsafe.Add(ref blockRef, dctSize * 1) - Unsafe.Add(ref blockRef, dctSize * 6);
+                tmp2 = Unsafe.Add(ref blockRef, dctSize * 2) + Unsafe.Add(ref blockRef, dctSize * 5);
+                tmp5 = Unsafe.Add(ref blockRef, dctSize * 2) - Unsafe.Add(ref blockRef, dctSize * 5);
+                tmp3 = Unsafe.Add(ref blockRef, dctSize * 3) + Unsafe.Add(ref blockRef, dctSize * 4);
+                tmp4 = Unsafe.Add(ref blockRef, dctSize * 3) - Unsafe.Add(ref blockRef, dctSize * 4);

                // Even part
                tmp10 = tmp0 + tmp3;
@ -195,12 +296,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                tmp11 = tmp1 + tmp2;
                tmp12 = tmp1 - tmp2;

-                Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11;
-                Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11;
+                Unsafe.Add(ref blockRef, dctSize * 0) = tmp10 + tmp11;
+                Unsafe.Add(ref blockRef, dctSize * 4) = tmp10 - tmp11;

                z1 = (tmp12 + tmp13) * 0.707106781f;
-                Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1;
-                Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1;
+                Unsafe.Add(ref blockRef, dctSize * 2) = tmp13 + z1;
+                Unsafe.Add(ref blockRef, dctSize * 6) = tmp13 - z1;

                // Odd part
                tmp10 = tmp4 + tmp5;
@ -215,12 +316,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
                z11 = tmp7 + z3;
                z13 = tmp7 - z3;

-                Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2;
-                Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2;
-                Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4;
-                Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4;
+                Unsafe.Add(ref blockRef, dctSize * 5) = z13 + z2;
+                Unsafe.Add(ref blockRef, dctSize * 3) = z13 - z2;
+                Unsafe.Add(ref blockRef, dctSize * 1) = z11 + z4;
+                Unsafe.Add(ref blockRef, dctSize * 7) = z11 - z4;

-                dataRef = ref Unsafe.Add(ref dataRef, 1);
+                blockRef = ref Unsafe.Add(ref blockRef, 1);
            }
        }

@ -230,11 +331,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        /// <remarks>
        /// This implementation must be called only if hardware supports 4
        /// floating point numbers vector. Otherwise explicit scalar
-        /// implementation <see cref="ForwardTransform_Scalar"/> is faster
-        /// because it does not rely on matrix transposition.
+        /// implementation <see cref="FDCT_Scalar"/> is faster
+        /// because it does not rely on block transposition.
        /// </remarks>
-        /// <param name="block">Input matrix.</param>
-        private static void ForwardTransform_Vector4(ref Block8x8F block)
+        /// <param name="block">Input block.</param>
+        public static void FDCT_Vector4(ref Block8x8F block)
        {
            DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");

@ -247,209 +348,50 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            block.TransposeInplace();
            FDCT8x4_Vector4(ref block.V0L);
            FDCT8x4_Vector4(ref block.V0R);
-        }

-        /// <summary>
-        /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix.
-        /// </summary>
-        /// <remarks>
-        /// Implemented using Vector4 API operations for either scalar or sse hardware implementation.
-        /// Must be called on both 8x4 matrix parts for the full FDCT transform.
-        /// </remarks>
-        /// <param name="blockRef">Input reference to the first </param>
-        private static void FDCT8x4_Vector4(ref Vector4 blockRef)
-        {
-            Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14);
-            Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14);
-            Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12);
-            Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12);
-            Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10);
-            Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10);
-            Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8);
-            Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8);
-
-            // Even part
-            Vector4 tmp10 = tmp0 + tmp3;
-            Vector4 tmp13 = tmp0 - tmp3;
-            Vector4 tmp11 = tmp1 + tmp2;
-            Vector4 tmp12 = tmp1 - tmp2;
-
-            Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
-            Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11;
-
-            Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
-            Unsafe.Add(ref blockRef, 4) = tmp13 + z1;
-            Unsafe.Add(ref blockRef, 12) = tmp13 - z1;
-
-            // Odd part
-            tmp10 = tmp4 + tmp5;
-            tmp11 = tmp5 + tmp6;
-            tmp12 = tmp6 + tmp7;
-
-            Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
-            Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
-            Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
-            Vector4 z3 = tmp11 * mm128_F_0_7071;
-
-            Vector4 z11 = tmp7 + z3;
-            Vector4 z13 = tmp7 - z3;
-
-            Unsafe.Add(ref blockRef, 10) = z13 + z2;
-            Unsafe.Add(ref blockRef, 6) = z13 - z2;
-            Unsafe.Add(ref blockRef, 2) = z11 + z4;
-            Unsafe.Add(ref blockRef, 14) = z11 - z4;
-        }
+            // Applies 1D floating point FDCT inplace on 8x4 part of 8x8 block
+            static void FDCT8x4_Vector4(ref Vector4 vecRef)
+            {
+                Vector4 tmp0 = Unsafe.Add(ref vecRef, 0) + Unsafe.Add(ref vecRef, 14);
+                Vector4 tmp7 = Unsafe.Add(ref vecRef, 0) - Unsafe.Add(ref vecRef, 14);
+                Vector4 tmp1 = Unsafe.Add(ref vecRef, 2) + Unsafe.Add(ref vecRef, 12);
+                Vector4 tmp6 = Unsafe.Add(ref vecRef, 2) - Unsafe.Add(ref vecRef, 12);
+                Vector4 tmp2 = Unsafe.Add(ref vecRef, 4) + Unsafe.Add(ref vecRef, 10);
+                Vector4 tmp5 = Unsafe.Add(ref vecRef, 4) - Unsafe.Add(ref vecRef, 10);
+                Vector4 tmp3 = Unsafe.Add(ref vecRef, 6) + Unsafe.Add(ref vecRef, 8);
+                Vector4 tmp4 = Unsafe.Add(ref vecRef, 6) - Unsafe.Add(ref vecRef, 8);

-        /// <summary>
-        /// Apply floating point IDCT inplace.
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
-        /// </summary>
-        /// <param name="block">Input matrix.</param>
-        /// <param name="temp">Matrix to store temporal results.</param>
-        public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
-        {
-            block.TransposeInplace();
-            IDCT8x8(ref block, ref temp);
-            temp.TransposeInplace();
-            IDCT8x8(ref temp, ref block);
+                // Even part
+                Vector4 tmp10 = tmp0 + tmp3;
+                Vector4 tmp13 = tmp0 - tmp3;
+                Vector4 tmp11 = tmp1 + tmp2;
+                Vector4 tmp12 = tmp1 - tmp2;

-            // TODO: This can be fused into quantization table step
-            block.MultiplyInPlace(C_0_125);
-        }
+                Unsafe.Add(ref vecRef, 0) = tmp10 + tmp11;
+                Unsafe.Add(ref vecRef, 8) = tmp10 - tmp11;

-        /// <summary>
-        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
-        /// </summary>
-        /// <param name="s">Source</param>
-        /// <param name="d">Destination</param>
-        private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
-            {
-                IDCT8x8_Avx(ref s, ref d);
-            }
-            else
-#endif
-            {
-                IDCT8x4_LeftPart(ref s, ref d);
-                IDCT8x4_RightPart(ref s, ref d);
-            }
-        }
+                Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
+                Unsafe.Add(ref vecRef, 4) = tmp13 + z1;
+                Unsafe.Add(ref vecRef, 12) = tmp13 - z1;

-        /// <summary>
-        /// Do IDCT internal operations on the left part of the block. Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">Destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1L;
-            Vector4 my7 = s.V7L;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3L;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5L;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2L;
-            Vector4 my6 = s.V6L;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0L;
-            Vector4 my4 = s.V4L;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0L = my0 + mb0;
-            d.V7L = my0 - mb0;
-            d.V1L = my1 + mb1;
-            d.V6L = my1 - mb1;
-            d.V2L = my2 + mb2;
-            d.V5L = my2 - mb2;
-            d.V3L = my3 + mb3;
-            d.V4L = my3 - mb3;
-        }
+                // Odd part
+                tmp10 = tmp4 + tmp5;
+                tmp11 = tmp5 + tmp6;
+                tmp12 = tmp6 + tmp7;

-        /// <summary>
-        /// Do IDCT internal operations on the right part of the block.
-        /// Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">The destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1R;
-            Vector4 my7 = s.V7R;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3R;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5R;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2R;
-            Vector4 my6 = s.V6R;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0R;
-            Vector4 my4 = s.V4R;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0R = my0 + mb0;
-            d.V7R = my0 - mb0;
-            d.V1R = my1 + mb1;
-            d.V6R = my1 - mb1;
-            d.V2R = my2 + mb2;
-            d.V5R = my2 - mb2;
-            d.V3R = my3 + mb3;
-            d.V4R = my3 - mb3;
+                Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
+                Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
+                Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
+                Vector4 z3 = tmp11 * mm128_F_0_7071;
+
+                Vector4 z11 = tmp7 + z3;
+                Vector4 z13 = tmp7 - z3;
+
+                Unsafe.Add(ref vecRef, 10) = z13 + z2;
+                Unsafe.Add(ref vecRef, 6) = z13 - z2;
+                Unsafe.Add(ref vecRef, 2) = z11 + z4;
+                Unsafe.Add(ref vecRef, 14) = z11 - z4;
+            }
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
@ -35,5 +35,34 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            63, 63, 63, 63, 63, 63, 63, 63,
            63, 63, 63, 63, 63, 63, 63, 63
        };
+
+        /// <summary>
+        /// Gets a span of zig-zag ordering indices with the block transpose step fused in.
+        /// </summary>
+        /// <remarks>
+        /// When reading corrupted data, the Huffman decoders could attempt
+        /// to reference an entry beyond the end of this array (if the decoded
+        /// zero run length reaches past the end of the block).  To prevent
+        /// wild stores without adding an inner-loop test, we put some extra
+        /// "63"s after the real entries.  This will cause the extra coefficient
+        /// to be stored in location 63 of the block, not somewhere random.
+        /// The worst case would be a run-length of 15, which means we need 16
+        /// fake entries.
+        /// </remarks>
+        public static ReadOnlySpan<byte> TransposingOrder => new byte[]
+        {
+            0,  8,  1,  2,  9,  16, 24, 17,
+            10, 3,  4,  11, 18, 25, 32, 40,
+            33, 26, 19, 12, 5,  6,  13, 20,
+            27, 34, 41, 48, 56, 49, 42, 35,
+            28, 21, 14, 7,  15, 22, 29, 36,
+            43, 50, 57, 58, 51, 44, 37, 30,
+            23, 31, 38, 45, 52, 59, 60, 53,
+            46, 39, 47, 54, 61, 62, 55, 63,
+
+            // Extra entries for safety in decoder
+            63, 63, 63, 63, 63, 63, 63, 63,
+            63, 63, 63, 63, 63, 63, 63, 63
+        };
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
@ -942,6 +942,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
                        break;
                    }
                }
+
+                // Adjusting table for IDCT step during decompression
+                FastFloatingPointDCT.AdjustToIDCT(ref table);
            }
        }

--- a/src/ImageSharp/Formats/Webp/EntropyIx.cs
+++ b/src/ImageSharp/Formats/Webp/EntropyIx.cs
@ -6,7 +6,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
    /// <summary>
    /// These five modes are evaluated and their respective entropy is computed.
    /// </summary>
-    internal enum EntropyIx
+    internal enum EntropyIx : byte
    {
        Direct = 0,

--- a/src/ImageSharp/Formats/Webp/HistoIx.cs
+++ b/src/ImageSharp/Formats/Webp/HistoIx.cs
@ -3,7 +3,7 @@

 namespace SixLabors.ImageSharp.Formats.Webp
 {
-    internal enum HistoIx
+    internal enum HistoIx : byte
    {
        HistoAlpha = 0,

--- a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs
@ -2,11 +2,13 @@
 // Licensed under the Apache License, Version 2.0.

 using System;
+using System.Buffers;
 using System.Collections.Generic;
+using SixLabors.ImageSharp.Memory;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
-    internal class BackwardReferenceEncoder
+    internal static class BackwardReferenceEncoder
    {
        /// <summary>
        /// Maximum bit length.
@ -41,6 +43,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            int quality,
            int lz77TypesToTry,
            ref int cacheBits,
+            MemoryAllocator memoryAllocator,
            Vp8LHashChain hashChain,
            Vp8LBackwardRefs best,
            Vp8LBackwardRefs worst)
@ -69,7 +72,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                        BackwardReferencesLz77(width, height, bgra, 0, hashChain, worst);
                        break;
                    case Vp8LLz77Type.Lz77Box:
-                        hashChainBox = new Vp8LHashChain(width * height);
+                        hashChainBox = new Vp8LHashChain(memoryAllocator, width * height);
                        BackwardReferencesLz77Box(width, height, bgra, 0, hashChain, hashChainBox, worst);
                        break;
                }
@ -100,7 +103,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            if ((lz77TypeBest == (int)Vp8LLz77Type.Lz77Standard || lz77TypeBest == (int)Vp8LLz77Type.Lz77Box) && quality >= 25)
            {
                Vp8LHashChain hashChainTmp = lz77TypeBest == (int)Vp8LLz77Type.Lz77Standard ? hashChain : hashChainBox;
-                BackwardReferencesTraceBackwards(width, height, bgra, cacheBits, hashChainTmp, best, worst);
+                BackwardReferencesTraceBackwards(width, height, memoryAllocator, bgra, cacheBits, hashChainTmp, best, worst);
                var histo = new Vp8LHistogram(worst, cacheBits);
                double bitCostTrace = histo.EstimateBits(stats, bitsEntropy);
                if (bitCostTrace < bitCostBest)
@ -111,6 +114,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

            BackwardReferences2DLocality(width, best);

+            hashChainBox?.Dispose();
+
            return best;
        }

@ -234,6 +239,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        private static void BackwardReferencesTraceBackwards(
            int xSize,
            int ySize,
+            MemoryAllocator memoryAllocator,
            ReadOnlySpan<uint> bgra,
            int cacheBits,
            Vp8LHashChain hashChain,
@ -241,22 +247,24 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            Vp8LBackwardRefs refsDst)
        {
            int distArraySize = xSize * ySize;
-            ushort[] distArray = new ushort[distArraySize];
+            using IMemoryOwner<ushort> distArrayBuffer = memoryAllocator.Allocate<ushort>(distArraySize);
+            Span<ushort> distArray = distArrayBuffer.GetSpan();

-            BackwardReferencesHashChainDistanceOnly(xSize, ySize, bgra, cacheBits, hashChain, refsSrc, distArray);
+            BackwardReferencesHashChainDistanceOnly(xSize, ySize, memoryAllocator, bgra, cacheBits, hashChain, refsSrc, distArrayBuffer);
            int chosenPathSize = TraceBackwards(distArray, distArraySize);
-            Span<ushort> chosenPath = distArray.AsSpan(distArraySize - chosenPathSize);
+            Span<ushort> chosenPath = distArray.Slice(distArraySize - chosenPathSize);
            BackwardReferencesHashChainFollowChosenPath(bgra, cacheBits, chosenPath, chosenPathSize, hashChain, refsDst);
        }

        private static void BackwardReferencesHashChainDistanceOnly(
            int xSize,
            int ySize,
+            MemoryAllocator memoryAllocator,
            ReadOnlySpan<uint> bgra,
            int cacheBits,
            Vp8LHashChain hashChain,
            Vp8LBackwardRefs refs,
-            ushort[] distArray)
+            IMemoryOwner<ushort> distArrayBuffer)
        {
            int pixCount = xSize * ySize;
            bool useColorCache = cacheBits > 0;
@ -275,22 +283,24 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            }

            costModel.Build(xSize, cacheBits, refs);
-            var costManager = new CostManager(distArray, pixCount, costModel);
+            using var costManager = new CostManager(memoryAllocator, distArrayBuffer, pixCount, costModel);
+            Span<float> costManagerCosts = costManager.Costs.GetSpan();
+            Span<ushort> distArray = distArrayBuffer.GetSpan();

            // We loop one pixel at a time, but store all currently best points to non-processed locations from this point.
            distArray[0] = 0;

            // Add first pixel as literal.
-            AddSingleLiteralWithCostModel(bgra, colorCache, costModel, 0, useColorCache, 0.0f, costManager.Costs, distArray);
+            AddSingleLiteralWithCostModel(bgra, colorCache, costModel, 0, useColorCache, 0.0f, costManagerCosts, distArray);

            for (int i = 1; i < pixCount; i++)
            {
-                float prevCost = costManager.Costs[i - 1];
+                float prevCost = costManagerCosts[i - 1];
                int offset = hashChain.FindOffset(i);
                int len = hashChain.FindLength(i);

                // Try adding the pixel as a literal.
-                AddSingleLiteralWithCostModel(bgra, colorCache, costModel, i, useColorCache, prevCost, costManager.Costs, distArray);
+                AddSingleLiteralWithCostModel(bgra, colorCache, costModel, i, useColorCache, prevCost, costManagerCosts, distArray);

                // If we are dealing with a non-literal.
                if (len >= 2)
@ -334,7 +344,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                            costManager.UpdateCostAtIndex(j - 1, false);
                            costManager.UpdateCostAtIndex(j, false);

-                            costManager.PushInterval(costManager.Costs[j - 1] + offsetCost, j, lenJ);
+                            costManager.PushInterval(costManagerCosts[j - 1] + offsetCost, j, lenJ);
                            reach = j + lenJ - 1;
                        }
                    }
@ -346,7 +356,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            }
        }

-        private static int TraceBackwards(ushort[] distArray, int distArraySize)
+        private static int TraceBackwards(Span<ushort> distArray, int distArraySize)
        {
            int chosenPathSize = 0;
            int pathPos = distArraySize;
@ -426,8 +436,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            int idx,
            bool useColorCache,
            float prevCost,
-            float[] cost,
-            ushort[] distArray)
+            Span<float> cost,
+            Span<ushort> distArray)
        {
            double costVal = prevCost;
            uint color = bgra[idx];
@ -617,7 +627,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                }
            }

-            hashChain.OffsetLength[0] = 0;
+            Span<uint> hashChainOffsetLength = hashChain.OffsetLength.GetSpan();
+            hashChainOffsetLength[0] = 0;
            for (i = 1; i < pixelCount; i++)
            {
                int ind;
@ -695,19 +706,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

                if (bestLength <= MinLength)
                {
-                    hashChain.OffsetLength[i] = 0;
+                    hashChainOffsetLength[i] = 0;
                    bestOffsetPrev = 0;
                    bestLengthPrev = 0;
                }
                else
                {
-                    hashChain.OffsetLength[i] = (uint)((bestOffset << MaxLengthBits) | bestLength);
+                    hashChainOffsetLength[i] = (uint)((bestOffset << MaxLengthBits) | bestLength);
                    bestOffsetPrev = bestOffset;
                    bestLengthPrev = bestLength;
                }
            }

-            hashChain.OffsetLength[0] = 0;
+            hashChainOffsetLength[0] = 0;
            BackwardReferencesLz77(xSize, ySize, bgra, cacheBits, hashChain, refs);
        }

--- a/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs
@ -1,7 +1,10 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

+using System;
+using System.Buffers;
 using System.Collections.Generic;
+using SixLabors.ImageSharp.Memory;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
@ -10,20 +13,29 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
    /// It caches the different CostCacheInterval, caches the different
    /// GetLengthCost(costModel, k) in costCache and the CostInterval's.
    /// </summary>
-    internal class CostManager
+    internal sealed class CostManager : IDisposable
    {
        private CostInterval head;

-        public CostManager(ushort[] distArray, int pixCount, CostModel costModel)
+        private const int FreeIntervalsStartCount = 25;
+
+        private readonly Stack<CostInterval> freeIntervals = new(FreeIntervalsStartCount);
+
+        public CostManager(MemoryAllocator memoryAllocator, IMemoryOwner<ushort> distArray, int pixCount, CostModel costModel)
        {
            int costCacheSize = pixCount > BackwardReferenceEncoder.MaxLength ? BackwardReferenceEncoder.MaxLength : pixCount;

            this.CacheIntervals = new List<CostCacheInterval>();
            this.CostCache = new List<double>();
-            this.Costs = new float[pixCount];
+            this.Costs = memoryAllocator.Allocate<float>(pixCount);
            this.DistArray = distArray;
            this.Count = 0;

+            for (int i = 0; i < FreeIntervalsStartCount; i++)
+            {
+                this.freeIntervals.Push(new CostInterval());
+            }
+
            // Fill in the cost cache.
            this.CacheIntervalsSize++;
            this.CostCache.Add(costModel.GetLengthCost(0));
@ -64,10 +76,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            }

            // Set the initial costs high for every pixel as we will keep the minimum.
-            for (int i = 0; i < pixCount; i++)
-            {
-                this.Costs[i] = 1e38f;
-            }
+            this.Costs.GetSpan().Fill(1e38f);
        }

        /// <summary>
@ -82,9 +91,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

        public int CacheIntervalsSize { get; }

-        public float[] Costs { get; }
+        public IMemoryOwner<float> Costs { get; }

-        public ushort[] DistArray { get; }
+        public IMemoryOwner<ushort> DistArray { get; }

        public List<CostCacheInterval> CacheIntervals { get; }

@ -128,6 +137,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            // interval logic, just serialize it right away. This constant is empirical.
            int skipDistance = 10;

+            Span<float> costs = this.Costs.GetSpan();
+            Span<ushort> distArray = this.DistArray.GetSpan();
            if (len < skipDistance)
            {
                for (int j = position; j < position + len; j++)
@ -135,10 +146,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                    int k = j - position;
                    float costTmp = (float)(distanceCost + this.CostCache[k]);

-                    if (this.Costs[j] > costTmp)
+                    if (costs[j] > costTmp)
                    {
-                        this.Costs[j] = costTmp;
-                        this.DistArray[j] = (ushort)(k + 1);
+                        costs[j] = costTmp;
+                        distArray[j] = (ushort)(k + 1);
                    }
                }

@ -201,10 +212,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                            this.InsertInterval(interval, interval.Cost, interval.Index, end, endOriginal);
                            break;
                        }
-                        else
-                        {
-                            interval.End = start;
-                        }
+
+                        interval.End = start;
                    }
                }

@ -226,6 +235,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

            this.ConnectIntervals(interval.Previous, interval.Next);
            this.Count--;
+
+            interval.Next = null;
+            interval.Previous = null;
+            this.freeIntervals.Push(interval);
        }

        private void InsertInterval(CostInterval intervalIn, float cost, int position, int start, int end)
@ -236,13 +249,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            }

            // TODO: should we use COST_CACHE_INTERVAL_SIZE_MAX?
-            var intervalNew = new CostInterval()
+            CostInterval intervalNew;
+            if (this.freeIntervals.Count > 0)
            {
-                Cost = cost,
-                Start = start,
-                End = end,
-                Index = position
-            };
+                intervalNew = this.freeIntervals.Pop();
+                intervalNew.Cost = cost;
+                intervalNew.Start = start;
+                intervalNew.End = end;
+                intervalNew.Index = position;
+            }
+            else
+            {
+                intervalNew = new CostInterval() { Cost = cost, Start = start, End = end, Index = position };
+            }

            this.PositionOrphanInterval(intervalNew, intervalIn);
            this.Count++;
@ -297,12 +316,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        /// </summary>
        private void UpdateCost(int i, int position, float cost)
        {
+            Span<float> costs = this.Costs.GetSpan();
+            Span<ushort> distArray = this.DistArray.GetSpan();
            int k = i - position;
-            if (this.Costs[i] > cost)
+            if (costs[i] > cost)
            {
-                this.Costs[i] = cost;
-                this.DistArray[i] = (ushort)(k + 1);
+                costs[i] = cost;
+                distArray[i] = (ushort)(k + 1);
            }
        }
+
+        /// <inheritdoc />
+        public void Dispose() => this.Costs.Dispose();
    }
 }
--- a/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs
@ -13,16 +13,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
    ///  - UsePackedTable: few enough literal symbols, so all the bit codes can fit into a small look-up table PackedTable[]
    /// The common literal base, if applicable, is stored in 'LiteralArb'.
    /// </summary>
-    internal class HTreeGroup
+    internal struct HTreeGroup
    {
        public HTreeGroup(uint packedTableSize)
        {
            this.HTrees = new List<HuffmanCode[]>(WebpConstants.HuffmanCodesPerMetaCode);
            this.PackedTable = new HuffmanCode[packedTableSize];
-            for (int i = 0; i < packedTableSize; i++)
-            {
-                this.PackedTable[i] = new HuffmanCode();
-            }
+            this.IsTrivialCode = false;
+            this.IsTrivialLiteral = false;
+            this.LiteralArb = 0;
+            this.UsePackedTable = false;
        }

        /// <summary>
--- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanCode.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanCode.cs
@ -9,7 +9,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
    /// A classic way to do entropy coding where a smaller number of bits are used for more frequent codes.
    /// </summary>
    [DebuggerDisplay("BitsUsed: {BitsUsed}, Value: {Value}")]
-    internal class HuffmanCode
+    internal struct HuffmanCode
    {
        /// <summary>
        /// Gets or sets the number of bits used for this symbol.
--- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs
@ -9,7 +9,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
    /// Represents the Huffman tree.
    /// </summary>
    [DebuggerDisplay("TotalCount = {TotalCount}, Value = {Value}, Left = {PoolIndexLeft}, Right = {PoolIndexRight}")]
-    internal struct HuffmanTree : IDeepCloneable
+    internal struct HuffmanTree
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="HuffmanTree"/> struct.
@ -57,7 +57,5 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

            return t1.Value < t2.Value ? -1 : 1;
        }
-
-        public IDeepCloneable DeepClone() => new HuffmanTree(this);
    }
 }
--- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
@ -2,6 +2,7 @@
 // Licensed under the Apache License, Version 2.0.

 using System;
+using System.Runtime.CompilerServices;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
@ -218,8 +219,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                    while (treeSize > 1)
                    {
                        // Finish when we have only one root.
-                        treePool[treePoolSize++] = (HuffmanTree)tree[treeSize - 1].DeepClone();
-                        treePool[treePoolSize++] = (HuffmanTree)tree[treeSize - 2].DeepClone();
+                        treePool[treePoolSize++] = tree[treeSize - 1];
+                        treePool[treePoolSize++] = tree[treeSize - 2];
                        int count = treePool[treePoolSize - 1].TotalCount + treePool[treePoolSize - 2].TotalCount;
                        treeSize -= 2;

@ -238,7 +239,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                        int startIdx = endIdx + num - 1;
                        for (int i = startIdx; i >= endIdx; i--)
                        {
-                            tree[i] = (HuffmanTree)tree[i - 1].DeepClone();
+                            tree[i] = tree[i - 1];
                        }

                        tree[k].TotalCount = count;
@ -307,9 +308,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

        public static int BuildHuffmanTable(Span<HuffmanCode> table, int rootBits, int[] codeLengths, int codeLengthsSize)
        {
-            Guard.MustBeGreaterThan(rootBits, 0, nameof(rootBits));
-            Guard.NotNull(codeLengths, nameof(codeLengths));
-            Guard.MustBeGreaterThan(codeLengthsSize, 0, nameof(codeLengthsSize));
+            DebugGuard.MustBeGreaterThan(rootBits, 0, nameof(rootBits));
+            DebugGuard.NotNull(codeLengths, nameof(codeLengths));
+            DebugGuard.MustBeGreaterThan(codeLengthsSize, 0, nameof(codeLengthsSize));

            // sorted[codeLengthsSize] is a pre-allocated array for sorting symbols by code length.
            int[] sorted = new int[codeLengthsSize];
@ -467,27 +468,27 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

                    break;
                }
-                else if (repetitions < 11)
+
+                if (repetitions < 11)
                {
                    tokens[pos].Code = 17;
                    tokens[pos].ExtraBits = (byte)(repetitions - 3);
                    pos++;
                    break;
                }
-                else if (repetitions < 139)
+
+                if (repetitions < 139)
                {
                    tokens[pos].Code = 18;
                    tokens[pos].ExtraBits = (byte)(repetitions - 11);
                    pos++;
                    break;
                }
-                else
-                {
-                    tokens[pos].Code = 18;
-                    tokens[pos].ExtraBits = 0x7f;  // 138 repeated 0s
-                    pos++;
-                    repetitions -= 138;
-                }
+
+                tokens[pos].Code = 18;
+                tokens[pos].ExtraBits = 0x7f;  // 138 repeated 0s
+                pos++;
+                repetitions -= 138;
            }

            return pos;
@ -519,20 +520,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

                    break;
                }
-                else if (repetitions < 7)
+
+                if (repetitions < 7)
                {
                    tokens[pos].Code = 16;
                    tokens[pos].ExtraBits = (byte)(repetitions - 3);
                    pos++;
                    break;
                }
-                else
-                {
-                    tokens[pos].Code = 16;
-                    tokens[pos].ExtraBits = 3;
-                    pos++;
-                    repetitions -= 6;
-                }
+
+                tokens[pos].Code = 16;
+                tokens[pos].ExtraBits = 3;
+                pos++;
+                repetitions -= 6;
            }

            return pos;
@ -541,7 +541,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        /// <summary>
        /// Get the actual bit values for a tree of bit depths.
        /// </summary>
-        /// <param name="tree">The hiffman tree.</param>
+        /// <param name="tree">The huffman tree.</param>
        private static void ConvertBitDepthsToSymbols(HuffmanTreeCode tree)
        {
            // 0 bit-depth means that the symbol does not exist.
@ -628,7 +628,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        /// </summary>
        private static void ReplicateValue(Span<HuffmanCode> table, int step, int end, HuffmanCode code)
        {
-            Guard.IsTrue(end % step == 0, nameof(end), "end must be a multiple of step");
+            DebugGuard.IsTrue(end % step == 0, nameof(end), "end must be a multiple of step");

            do
            {
@ -656,6 +656,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        /// <summary>
        /// Heuristics for selecting the stride ranges to collapse.
        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
        private static bool ValuesShouldBeCollapsedToStrideAverage(int a, int b) => Math.Abs(a - b) < 4;
    }
 }
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@ -2,6 +2,7 @@
 // Licensed under the Apache License, Version 2.0.

 using System;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Memory;
@ -80,8 +81,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        public static int VectorMismatch(ReadOnlySpan<uint> array1, ReadOnlySpan<uint> array2, int length)
        {
            int matchLen = 0;
+            ref uint array1Ref = ref MemoryMarshal.GetReference(array1);
+            ref uint array2Ref = ref MemoryMarshal.GetReference(array2);

-            while (matchLen < length && array1[matchLen] == array2[matchLen])
+            while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen))
            {
                matchLen++;
            }
@ -759,28 +762,184 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        /// <returns>Shannon entropy.</returns>
        public static float CombinedShannonEntropy(Span<int> x, Span<int> y)
        {
-            double retVal = 0.0d;
-            uint sumX = 0, sumXY = 0;
-            for (int i = 0; i < 256; i++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
            {
-                uint xi = (uint)x[i];
-                if (xi != 0)
+                double retVal = 0.0d;
+                Vector256<int> tmp = Vector256<int>.Zero;    // has the size of the scratch space of sizeof(int) * 8
+                ref int xRef = ref MemoryMarshal.GetReference(x);
+                ref int yRef = ref MemoryMarshal.GetReference(y);
+                Vector256<int> sumXY256 = Vector256<int>.Zero;
+                Vector256<int> sumX256 = Vector256<int>.Zero;
+                ref int tmpRef = ref Unsafe.As<Vector256<int>, int>(ref tmp);
+                for (nint i = 0; i < 256; i += 8)
                {
-                    uint xy = xi + (uint)y[i];
-                    sumX += xi;
-                    retVal -= FastSLog2(xi);
-                    sumXY += xy;
-                    retVal -= FastSLog2(xy);
+                    Vector256<int> xVec = Unsafe.As<int, Vector256<int>>(ref Unsafe.Add(ref xRef, i));
+                    Vector256<int> yVec = Unsafe.As<int, Vector256<int>>(ref Unsafe.Add(ref yRef, i));
+
+                    // Check if any X is non-zero: this actually provides a speedup as X is usually sparse.
+                    int mask = Avx2.MoveMask(Avx2.CompareEqual(xVec, Vector256<int>.Zero).AsByte());
+                    if (mask != -1)
+                    {
+                        Vector256<int> xy256 = Avx2.Add(xVec, yVec);
+                        sumXY256 = Avx2.Add(sumXY256, xy256);
+                        sumX256 = Avx2.Add(sumX256, xVec);
+
+                        // Analyze the different X + Y.
+                        Unsafe.As<int, Vector256<int>>(ref tmpRef) = xy256;
+                        if (tmpRef != 0)
+                        {
+                            retVal -= FastSLog2((uint)tmpRef);
+                            if (Unsafe.Add(ref xRef, i) != 0)
+                            {
+                                retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i));
+                            }
+                        }
+
+                        if (Unsafe.Add(ref tmpRef, 1) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 1));
+                            if (Unsafe.Add(ref xRef, i + 1) != 0)
+                            {
+                                retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 1));
+                            }
+                        }
+
+                        if (Unsafe.Add(ref tmpRef, 2) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 2));
+                            if (Unsafe.Add(ref xRef, i + 2) != 0)
+                            {
+                                retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 2));
+                            }
+                        }
+
+                        if (Unsafe.Add(ref tmpRef, 3) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 3));
+                            if (Unsafe.Add(ref xRef, i + 3) != 0)
+                            {
+                                retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 3));
+                            }
+                        }
+
+                        if (Unsafe.Add(ref tmpRef, 4) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 4));
+                            if (Unsafe.Add(ref xRef, i + 4) != 0)
+                            {
+                                retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 4));
+                            }
+                        }
+
+                        if (Unsafe.Add(ref tmpRef, 5) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 5));
+                            if (Unsafe.Add(ref xRef, i + 5) != 0)
+                            {
+                                retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 5));
+                            }
+                        }
+
+                        if (Unsafe.Add(ref tmpRef, 6) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 6));
+                            if (Unsafe.Add(ref xRef, i + 6) != 0)
+                            {
+                                retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 6));
+                            }
+                        }
+
+                        if (Unsafe.Add(ref tmpRef, 7) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 7));
+                            if (Unsafe.Add(ref xRef, i + 7) != 0)
+                            {
+                                retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 7));
+                            }
+                        }
+                    }
+                    else
+                    {
+                        // X is fully 0, so only deal with Y.
+                        sumXY256 = Avx2.Add(sumXY256, yVec);
+
+                        if (Unsafe.Add(ref yRef, i) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i));
+                        }
+
+                        if (Unsafe.Add(ref yRef, i + 1) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 1));
+                        }
+
+                        if (Unsafe.Add(ref yRef, i + 2) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 2));
+                        }
+
+                        if (Unsafe.Add(ref yRef, i + 3) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 3));
+                        }
+
+                        if (Unsafe.Add(ref yRef, i + 4) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 4));
+                        }
+
+                        if (Unsafe.Add(ref yRef, i + 5) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 5));
+                        }
+
+                        if (Unsafe.Add(ref yRef, i + 6) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 6));
+                        }
+
+                        if (Unsafe.Add(ref yRef, i + 7) != 0)
+                        {
+                            retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 7));
+                        }
+                    }
                }
-                else if (y[i] != 0)
+
+                // Sum up sumX256 to get sumX and sum up sumXY256 to get sumXY.
+                int sumX = Numerics.ReduceSum(sumX256);
+                int sumXY = Numerics.ReduceSum(sumXY256);
+
+                retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY);
+
+                return (float)retVal;
+            }
+            else
+#endif
+            {
+                double retVal = 0.0d;
+                uint sumX = 0, sumXY = 0;
+                for (int i = 0; i < 256; i++)
                {
-                    sumXY += (uint)y[i];
-                    retVal -= FastSLog2((uint)y[i]);
+                    uint xi = (uint)x[i];
+                    if (xi != 0)
+                    {
+                        uint xy = xi + (uint)y[i];
+                        sumX += xi;
+                        retVal -= FastSLog2(xi);
+                        sumXY += xy;
+                        retVal -= FastSLog2(xy);
+                    }
+                    else if (y[i] != 0)
+                    {
+                        sumXY += (uint)y[i];
+                        retVal -= FastSLog2((uint)y[i]);
+                    }
                }
-            }

-            retVal += FastSLog2(sumX) + FastSLog2(sumXY);
-            return (float)retVal;
+                retVal += FastSLog2(sumX) + FastSLog2(sumXY);
+                return (float)retVal;
+            }
        }

        [MethodImpl(InliningOptions.ShortMethod)]
@ -836,6 +995,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        private static float FastSLog2Slow(uint v)
        {
            DebugGuard.MustBeGreaterThanOrEqualTo<uint>(v, LogLookupIdxMax, nameof(v));
+
            if (v < ApproxLogWithCorrectionMax)
            {
                int logCnt = 0;
@ -865,7 +1025,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

        private static float FastLog2Slow(uint v)
        {
-            Guard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v));
+            DebugGuard.MustBeGreaterThanOrEqualTo<uint>(v, LogLookupIdxMax, nameof(v));

            if (v < ApproxLogWithCorrectionMax)
            {
--- a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs
@ -6,7 +6,7 @@ using System.Diagnostics;
 namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
    [DebuggerDisplay("Mode: {Mode}, Len: {Len}, BgraOrDistance: {BgraOrDistance}")]
-    internal class PixOrCopy
+    internal sealed class PixOrCopy
    {
        public PixOrCopyMode Mode { get; set; }

--- a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs
@ -3,7 +3,7 @@

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
-    internal enum PixOrCopyMode
+    internal enum PixOrCopyMode : byte
    {
        Literal,

--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs
@ -7,7 +7,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
    internal class Vp8LBackwardRefs
    {
-        public Vp8LBackwardRefs() => this.Refs = new List<PixOrCopy>();
+        public Vp8LBackwardRefs(int pixels) => this.Refs = new List<PixOrCopy>(pixels);

        /// <summary>
        /// Gets or sets the common block-size.
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@ -124,19 +124,25 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            this.EncodedData = memoryAllocator.Allocate<uint>(pixelCount);
            this.Palette = memoryAllocator.Allocate<uint>(WebpConstants.MaxPaletteSize);
            this.Refs = new Vp8LBackwardRefs[3];
-            this.HashChain = new Vp8LHashChain(pixelCount);
+            this.HashChain = new Vp8LHashChain(memoryAllocator, pixelCount);

            // We round the block size up, so we're guaranteed to have at most MaxRefsBlockPerImage blocks used:
            int refsBlockSize = ((pixelCount - 1) / MaxRefsBlockPerImage) + 1;
            for (int i = 0; i < this.Refs.Length; i++)
            {
-                this.Refs[i] = new Vp8LBackwardRefs
+                this.Refs[i] = new Vp8LBackwardRefs(pixelCount)
                {
                    BlockSize = refsBlockSize < MinBlockSize ? MinBlockSize : refsBlockSize
                };
            }
        }

+        // RFC 1951 will calm you down if you are worried about this funny sequence.
+        // This sequence is tuned from that, but more weighted for lower symbol count,
+        // and more spiking histograms.
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<byte> StorageOrder => new byte[] { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
        // This uses C#'s compiler optimization to refer to assembly's static data directly.
        private static ReadOnlySpan<byte> Order => new byte[] { 1, 2, 0, 3 };

@ -515,7 +521,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            }

            // Calculate backward references from BGRA image.
-            this.HashChain.Fill(this.memoryAllocator, bgra, this.quality, width, height, lowEffort);
+            this.HashChain.Fill(bgra, this.quality, width, height, lowEffort);

            Vp8LBitWriter bitWriterBest = config.SubConfigs.Count > 1 ? this.bitWriter.Clone() : this.bitWriter;
            Vp8LBitWriter bwInit = this.bitWriter;
@ -529,6 +535,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                    this.quality,
                    subConfig.Lz77,
                    ref cacheBits,
+                    this.memoryAllocator,
                    this.HashChain,
                    this.Refs[0],
                    this.Refs[1]);
@ -735,7 +742,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            }

            // Calculate backward references from the image pixels.
-            hashChain.Fill(this.memoryAllocator, bgra, quality, width, height, lowEffort);
+            hashChain.Fill(bgra, quality, width, height, lowEffort);

            Vp8LBackwardRefs refs = BackwardReferenceEncoder.GetBackwardReferences(
                width,
@ -744,6 +751,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                quality,
                (int)Vp8LLz77Type.Lz77Standard | (int)Vp8LLz77Type.Lz77Rle,
                ref cacheBits,
+                this.memoryAllocator,
                hashChain,
                refsTmp1,
                refsTmp2);
@ -940,16 +948,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

        private void StoreHuffmanTreeOfHuffmanTreeToBitMask(byte[] codeLengthBitDepth)
        {
-            // RFC 1951 will calm you down if you are worried about this funny sequence.
-            // This sequence is tuned from that, but more weighted for lower symbol count,
-            // and more spiking histograms.
-            byte[] storageOrder = { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-
            // Throw away trailing zeros:
            int codesToStore = WebpConstants.CodeLengthCodes;
            for (; codesToStore > 4; codesToStore--)
            {
-                if (codeLengthBitDepth[storageOrder[codesToStore - 1]] != 0)
+                if (codeLengthBitDepth[StorageOrder[codesToStore - 1]] != 0)
                {
                    break;
                }
@ -958,7 +961,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            this.bitWriter.PutBits((uint)codesToStore - 4, 4);
            for (int i = 0; i < codesToStore; i++)
            {
-                this.bitWriter.PutBits(codeLengthBitDepth[storageOrder[i]], 3);
+                this.bitWriter.PutBits(codeLengthBitDepth[StorageOrder[i]], 3);
            }
        }

@ -1802,6 +1805,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            this.BgraScratch.Dispose();
            this.Palette.Dispose();
            this.TransformData.Dispose();
+            this.HashChain.Dispose();
        }
    }
 }
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs
@ -8,7 +8,7 @@ using SixLabors.ImageSharp.Memory;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
-    internal class Vp8LHashChain
+    internal sealed class Vp8LHashChain : IDisposable
    {
        private const uint HashMultiplierHi = 0xc6a4a793u;

@ -28,14 +28,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        /// </summary>
        private const int WindowSize = (1 << WindowSizeBits) - 120;

+        private readonly MemoryAllocator memoryAllocator;
+
        /// <summary>
        /// Initializes a new instance of the <see cref="Vp8LHashChain"/> class.
        /// </summary>
+        /// <param name="memoryAllocator">The memory allocator.</param>
        /// <param name="size">The size of the chain.</param>
-        public Vp8LHashChain(int size)
+        public Vp8LHashChain(MemoryAllocator memoryAllocator, int size)
        {
-            this.OffsetLength = new uint[size];
-            this.OffsetLength.AsSpan().Fill(0xcdcdcdcd);
+            this.memoryAllocator = memoryAllocator;
+            this.OffsetLength = this.memoryAllocator.Allocate<uint>(size, AllocationOptions.Clean);
            this.Size = size;
        }

@ -45,16 +48,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        /// These 20 bits are the limit defined by GetWindowSizeForHashChain (through WindowSize = 1 &lt;&lt; 20).
        /// The lower 12 bits contain the length of the match.
        /// </summary>
-        public uint[] OffsetLength { get; }
+        public IMemoryOwner<uint> OffsetLength { get; }

        /// <summary>
        /// Gets the size of the hash chain.
-        /// This is the maximum size of the hash_chain that can be constructed.
+        /// This is the maximum size of the hash chain that can be constructed.
        /// Typically this is the pixel count (width x height) for a given image.
        /// </summary>
        public int Size { get; }

-        public void Fill(MemoryAllocator memoryAllocator, ReadOnlySpan<uint> bgra, int quality, int xSize, int ySize, bool lowEffort)
+        public void Fill(ReadOnlySpan<uint> bgra, int quality, int xSize, int ySize, bool lowEffort)
        {
            int size = xSize * ySize;
            int iterMax = GetMaxItersForQuality(quality);
@ -63,20 +66,21 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

            if (size <= 2)
            {
-                this.OffsetLength[0] = 0;
+                this.OffsetLength.GetSpan()[0] = 0;
                return;
            }

-            using IMemoryOwner<int> hashToFirstIndexBuffer = memoryAllocator.Allocate<int>(HashSize);
+            using IMemoryOwner<int> hashToFirstIndexBuffer = this.memoryAllocator.Allocate<int>(HashSize);
+            using IMemoryOwner<int> chainBuffer = this.memoryAllocator.Allocate<int>(size, AllocationOptions.Clean);
            Span<int> hashToFirstIndex = hashToFirstIndexBuffer.GetSpan();
+            Span<int> chain = chainBuffer.GetSpan();

            // Initialize hashToFirstIndex array to -1.
            hashToFirstIndex.Fill(-1);

-            int[] chain = new int[size];
-
            // Fill the chain linking pixels with the same hash.
            bool bgraComp = bgra.Length > 1 && bgra[0] == bgra[1];
+            Span<uint> tmp = stackalloc uint[2];
            for (pos = 0; pos < size - 2;)
            {
                uint hashCode;
@ -85,7 +89,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                {
                    // Consecutive pixels with the same color will share the same hash.
                    // We therefore use a different hash: the color and its repetition length.
-                    uint[] tmp = new uint[2];
+                    tmp.Clear();
                    uint len = 1;
                    tmp[0] = bgra[pos];

@ -134,7 +138,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            // Find the best match interval at each pixel, defined by an offset to the
            // pixel and a length. The right-most pixel cannot match anything to the right
            // (hence a best length of 0) and the left-most pixel nothing to the left (hence an offset of 0).
-            this.OffsetLength[0] = this.OffsetLength[size - 1] = 0;
+            Span<uint> offsetLength = this.OffsetLength.GetSpan();
+            offsetLength[0] = offsetLength[size - 1] = 0;
            for (int basePosition = size - 2; basePosition > 0;)
            {
                int maxLen = LosslessUtils.MaxFindCopyLength(size - 1 - basePosition);
@ -208,7 +213,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                uint maxBasePosition = (uint)basePosition;
                while (true)
                {
-                    this.OffsetLength[basePosition] = (bestDistance << BackwardReferenceEncoder.MaxLengthBits) | (uint)bestLength;
+                    offsetLength[basePosition] = (bestDistance << BackwardReferenceEncoder.MaxLengthBits) | (uint)bestLength;
                    --basePosition;

                    // Stop if we don't have a match or if we are out of bounds.
@ -242,10 +247,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        }

        [MethodImpl(InliningOptions.ShortMethod)]
-        public int FindLength(int basePosition) => (int)(this.OffsetLength[basePosition] & ((1U << BackwardReferenceEncoder.MaxLengthBits) - 1));
+        public int FindLength(int basePosition) => (int)(this.OffsetLength.GetSpan()[basePosition] & ((1U << BackwardReferenceEncoder.MaxLengthBits) - 1));

        [MethodImpl(InliningOptions.ShortMethod)]
-        public int FindOffset(int basePosition) => (int)(this.OffsetLength[basePosition] >> BackwardReferenceEncoder.MaxLengthBits);
+        public int FindOffset(int basePosition) => (int)(this.OffsetLength.GetSpan()[basePosition] >> BackwardReferenceEncoder.MaxLengthBits);

        /// <summary>
        /// Calculates the hash for a pixel pair.
@ -280,5 +285,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

            return maxWindowSize > WindowSize ? WindowSize : maxWindowSize;
        }
+
+        /// <inheritdoc />
+        public void Dispose() => this.OffsetLength.Dispose();
    }
 }
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
@ -3,10 +3,16 @@

 using System;
 using System.Collections.Generic;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless
 {
-    internal class Vp8LHistogram : IDeepCloneable
+    internal sealed class Vp8LHistogram : IDeepCloneable
    {
        private const uint NonTrivialSym = 0xffffffff;

@ -505,11 +511,52 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            return cost;
        }

-        private static void AddVector(uint[] a, uint[] b, uint[] output, int size)
+        private static void AddVector(Span<uint> a, Span<uint> b, Span<uint> output, int count)
        {
-            for (int i = 0; i < size; i++)
+            DebugGuard.MustBeGreaterThanOrEqualTo(a.Length, count, nameof(a.Length));
+            DebugGuard.MustBeGreaterThanOrEqualTo(b.Length, count, nameof(b.Length));
+            DebugGuard.MustBeGreaterThanOrEqualTo(output.Length, count, nameof(output.Length));
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
+            {
+                ref uint aRef = ref MemoryMarshal.GetReference(a);
+                ref uint bRef = ref MemoryMarshal.GetReference(b);
+                ref uint outputRef = ref MemoryMarshal.GetReference(output);
+                int i;
+
+                for (i = 0; i + 32 <= count; i += 32)
+                {
+                    // Load values.
+                    Vector256<uint> a0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref aRef, i));
+                    Vector256<uint> a1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref aRef, i + 8));
+                    Vector256<uint> a2 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref aRef, i + 16));
+                    Vector256<uint> a3 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref aRef, i + 24));
+                    Vector256<uint> b0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref bRef, i));
+                    Vector256<uint> b1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref bRef, i + 8));
+                    Vector256<uint> b2 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref bRef, i + 16));
+                    Vector256<uint> b3 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref bRef, i + 24));
+
+                    // Note: Avx2.Add is a lane-wise wrapping 32-bit add (_mm256_add_epi32), identical for signed and
+                    // unsigned lanes. Overflow is not a concern since the histogram values are less than 1<<28 (max picture count).
+                    Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref outputRef, i)) = Avx2.Add(a0, b0);
+                    Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref outputRef, i + 8)) = Avx2.Add(a1, b1);
+                    Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref outputRef, i + 16)) = Avx2.Add(a2, b2);
+                    Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref outputRef, i + 24)) = Avx2.Add(a3, b3);
+                }
+
+                for (; i < count; i++)
+                {
+                    output[i] = a[i] + b[i];
+                }
+            }
+            else
+#endif
            {
-                output[i] = a[i] + b[i];
+                for (int i = 0; i < count; i++)
+                {
+                    output[i] = a[i] + b[i];
+                }
            }
        }
    }
--- a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
@ -65,15 +65,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            FixedTableSize + 2704
        };

-        private static readonly byte[] CodeLengthCodeOrder = { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-
        private static readonly int NumCodeLengthCodes = CodeLengthCodeOrder.Length;

-        private static readonly byte[] LiteralMap =
-        {
-            0, 1, 1, 1, 0
-        };
-
        /// <summary>
        /// Initializes a new instance of the <see cref="WebpLosslessDecoder"/> class.
        /// </summary>
@ -87,6 +80,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
            this.configuration = configuration;
        }

+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<byte> CodeLengthCodeOrder => new byte[] { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        private static ReadOnlySpan<byte> LiteralMap => new byte[] { 0, 1, 1, 1, 0 };
+
        /// <summary>
        /// Decodes the image from the stream using the bitreader.
        /// </summary>
@ -834,10 +833,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless

        private void BuildPackedTable(HTreeGroup hTreeGroup)
        {
-            for (uint code = 0; code < HuffmanUtils.HuffmanPackedTableSize; ++code)
+            for (uint code = 0; code < HuffmanUtils.HuffmanPackedTableSize; code++)
            {
                uint bits = code;
-                HuffmanCode huff = hTreeGroup.PackedTable[bits];
+                ref HuffmanCode huff = ref hTreeGroup.PackedTable[bits];
                HuffmanCode hCode = hTreeGroup.HTrees[HuffIndex.Green][bits];
                if (hCode.Value >= WebpConstants.NumLiteralCodes)
                {
@ -848,10 +847,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
                {
                    huff.BitsUsed = 0;
                    huff.Value = 0;
-                    bits >>= AccumulateHCode(hCode, 8, huff);
-                    bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Red][bits], 16, huff);
-                    bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Blue][bits], 0, huff);
-                    bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Alpha][bits], 24, huff);
+                    bits >>= AccumulateHCode(hCode, 8, ref huff);
+                    bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Red][bits], 16, ref huff);
+                    bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Blue][bits], 0, ref huff);
+                    bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Alpha][bits], 24, ref huff);
                }
            }
        }
@ -992,7 +991,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
        }

        [MethodImpl(InliningOptions.ShortMethod)]
-        private static int AccumulateHCode(HuffmanCode hCode, int shift, HuffmanCode huff)
+        private static int AccumulateHCode(HuffmanCode hCode, int shift, ref HuffmanCode huff)
        {
            huff.BitsUsed += hCode.BitsUsed;
            huff.Value |= hCode.Value << shift;
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs
@ -76,10 +76,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
            this.TmpVBuffer = memoryAllocator.Allocate<byte>((int)width);
            this.Pixels = memoryAllocator.Allocate<byte>((int)(width * height * 4));

+#if DEBUG
+            // Filling these buffers with 205 is only useful for debugging:
+            // it makes their initial values match the reference libwebp implementation.
            this.YuvBuffer.Memory.Span.Fill(205);
            this.CacheY.Memory.Span.Fill(205);
            this.CacheU.Memory.Span.Fill(205);
            this.CacheV.Memory.Span.Fill(205);
+#endif

            this.Vp8BitReaders = new Vp8BitReader[WebpConstants.MaxNumPartitions];
        }
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
@ -6,7 +6,7 @@ using System.Runtime.CompilerServices;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal class Vp8Histogram
+    internal sealed class Vp8Histogram
    {
        private readonly int[] scratch = new int[16];

@ -49,7 +49,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
            this.distribution.AsSpan().Clear();
            for (j = startBlock; j < endBlock; j++)
            {
-                this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output);
+                Vp8Encoding.FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output, this.scratch);

                // Convert coefficients to bin.
                for (int k = 0; k < 16; ++k)
@ -98,48 +98,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
            this.lastNonZero = lastNonZero;
        }

-        private void Vp8FTransform(Span<byte> src, Span<byte> reference, Span<short> output)
-        {
-            int i;
-            Span<int> tmp = this.scratch;
-            tmp.Clear();
-
-            for (i = 0; i < 4; i++)
-            {
-                int d0 = src[0] - reference[0];   // 9bit dynamic range ([-255,255])
-                int d1 = src[1] - reference[1];
-                int d2 = src[2] - reference[2];
-                int d3 = src[3] - reference[3];
-                int a0 = d0 + d3; // 10b [-510,510]
-                int a1 = d1 + d2;
-                int a2 = d1 - d2;
-                int a3 = d0 - d3;
-                tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160]
-                tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542]
-                tmp[2 + (i * 4)] = (a0 - a1) * 8;
-                tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9;
-
-                // Do not change the span in the last iteration.
-                if (i < 3)
-                {
-                    src = src.Slice(WebpConstants.Bps);
-                    reference = reference.Slice(WebpConstants.Bps);
-                }
-            }
-
-            for (i = 0; i < 4; i++)
-            {
-                int a0 = tmp[0 + i] + tmp[12 + i];  // 15b
-                int a1 = tmp[4 + i] + tmp[8 + i];
-                int a2 = tmp[4 + i] - tmp[8 + i];
-                int a3 = tmp[0 + i] - tmp[12 + i];
-                output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b
-                output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0));
-                output[8 + i] = (short)((a0 - a1 + 7) >> 4);
-                output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16);
-            }
-        }
-
        [MethodImpl(InliningOptions.ShortMethod)]
        private static int ClipMax(int v, int max) => v > max ? max : v;
    }
--- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
@ -692,16 +692,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
            int mbw = io.MbW;
            int uvw = (mbw + 1) / 2;
            int y = io.MbY;
+            byte[] uvBuffer = new byte[(14 * 32) + 15];

            if (y == 0)
            {
                // First line is special cased. We mirror the u/v samples at boundary.
-                this.UpSample(curY, null, curU, curV, curU, curV, dst, null, mbw);
+                YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst, default, mbw, uvBuffer);
            }
            else
            {
                // We can finish the left-over line from previous call.
-                this.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw);
+                YuvConversion.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw, uvBuffer);
                numLinesOut++;
            }

@ -714,7 +715,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                topV = curV;
                curU = curU.Slice(io.UvStride);
                curV = curV.Slice(io.UvStride);
-                this.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw);
+                YuvConversion.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw, uvBuffer);
                curY = curY.Slice(ioStride2);
                dst = dst.Slice(bufferStride2);
            }
@ -736,67 +737,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
                // Process the very last row of even-sized picture.
                if ((yEnd & 1) == 0)
                {
-                    this.UpSample(curY, null, curU, curV, curU, curV, dst.Slice(bufferStride), null, mbw);
+                    YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst.Slice(bufferStride), default, mbw, uvBuffer);
                }
            }

            return numLinesOut;
        }

-        private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len)
-        {
-            int xStep = 3;
-            int lastPixelPair = (len - 1) >> 1;
-            uint tluv = YuvConversion.LoadUv(topU[0], topV[0]); // top-left sample
-            uint luv = YuvConversion.LoadUv(curU[0], curV[0]); // left-sample
-            uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
-            YuvConversion.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst);
-
-            if (bottomY != null)
-            {
-                uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
-                YuvConversion.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst);
-            }
-
-            for (int x = 1; x <= lastPixelPair; x++)
-            {
-                uint tuv = YuvConversion.LoadUv(topU[x], topV[x]); // top sample
-                uint uv = YuvConversion.LoadUv(curU[x], curV[x]); // sample
-
-                // Precompute invariant values associated with first and second diagonals.
-                uint avg = tluv + tuv + luv + uv + 0x00080008u;
-                uint diag12 = (avg + (2 * (tuv + luv))) >> 3;
-                uint diag03 = (avg + (2 * (tluv + uv))) >> 3;
-                uv0 = (diag12 + tluv) >> 1;
-                uint uv1 = (diag03 + tuv) >> 1;
-                int xMul2 = x * 2;
-                YuvConversion.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep));
-                YuvConversion.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep));
-
-                if (bottomY != null)
-                {
-                    uv0 = (diag03 + luv) >> 1;
-                    uv1 = (diag12 + uv) >> 1;
-                    YuvConversion.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep));
-                    YuvConversion.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep));
-                }
-
-                tluv = tuv;
-                luv = uv;
-            }
-
-            if ((len & 1) == 0)
-            {
-                uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
-                YuvConversion.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep));
-                if (bottomY != null)
-                {
-                    uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
-                    YuvConversion.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep));
-                }
-            }
-        }
-
        private void DoTransform(uint bits, Span<short> src, Span<byte> dst, Span<int> scratch)
        {
            switch (bits >> 30)
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@ -4,6 +4,11 @@
 using System;
 using System.Buffers;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;

@ -18,6 +23,291 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy

        private const int YuvHalf = 1 << (YuvFix - 1);

+#if SUPPORTS_RUNTIME_INTRINSICS
+        // Every byte lane set to 1; AND-ing with it keeps only the least significant bit of each lane.
+        private static readonly Vector128<byte> One = Vector128.Create((byte)1);
+
+        // These constants are 14b fixed-point version of ITU-R BT.601 constants.
+        // R = (19077 * y             + 26149 * v - 14234) >> 6
+        // G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
+        // B = (19077 * y + 33050 * u             - 17685) >> 6
+        // Each constant is broadcast to all eight 16-bit lanes and stored reinterpreted as bytes.
+        private static readonly Vector128<byte> K19077 = Vector128.Create((short)19077).AsByte();
+
+        private static readonly Vector128<byte> K26149 = Vector128.Create((short)26149).AsByte();
+
+        private static readonly Vector128<byte> K14234 = Vector128.Create((short)14234).AsByte();
+
+        // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic.
+        // It is spelled out as byte pairs instead: 26 + (129 << 8) = 33050 (low byte first).
+        private static readonly Vector128<byte> K33050 = Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129);
+
+        private static readonly Vector128<byte> K17685 = Vector128.Create((short)17685).AsByte();
+
+        private static readonly Vector128<byte> K6419 = Vector128.Create((short)6419).AsByte();
+
+        private static readonly Vector128<byte> K13320 = Vector128.Create((short)13320).AsByte();
+
+        private static readonly Vector128<byte> K8708 = Vector128.Create((short)8708).AsByte();
+
+        // Byte shuffle masks used to interleave three planar channels into 3-byte pixels.
+        // Each entry is the index of the source byte to copy; 255 has its high bit set, which
+        // makes the byte shuffle write zero to that lane so the channels can be OR-ed together.
+        // Masks 0-2 spread 16 source bytes over 48 output bytes at positions 0, 3, 6, ...
+        private static readonly Vector128<byte> PlanarTo24Shuffle0 = Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle1 = Vector128.Create(255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10, 255);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle2 = Vector128.Create(255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255, 255);
+
+        // Masks 3-5 do the same for output positions 1, 4, 7, ...
+        private static readonly Vector128<byte> PlanarTo24Shuffle3 = Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle4 = Vector128.Create(5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle5 = Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255);
+
+        // Masks 6-8 do the same for output positions 2, 5, 8, ...
+        private static readonly Vector128<byte> PlanarTo24Shuffle6 = Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle7 = Vector128.Create(255, 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle8 = Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15);
+#endif
+
+        // UpSample from YUV to BGR.
+        // Given samples laid out in a square as:
+        //  [a b]
+        //  [c d]
+        // we interpolate u/v as:
+        //  ([9*a + 3*b + 3*c +   d    3*a + 9*b +   c + 3*d] + [8 8]) / 16
+        //  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d] + [8 8]) / 16
+        /// <summary>
+        /// Upsamples the chroma samples for up to two luma rows and writes the converted pixels
+        /// (three bytes each) to the destination rows. Uses the SSE4.1 implementation when the
+        /// hardware supports it and the scalar implementation otherwise.
+        /// </summary>
+        /// <param name="topY">Luma samples of the top row.</param>
+        /// <param name="bottomY">Luma samples of the bottom row, or a default span when there is none.</param>
+        /// <param name="topU">U samples of the chroma row above.</param>
+        /// <param name="topV">V samples of the chroma row above.</param>
+        /// <param name="curU">U samples of the current chroma row.</param>
+        /// <param name="curV">V samples of the current chroma row.</param>
+        /// <param name="topDst">Destination of the converted top row.</param>
+        /// <param name="bottomDst">Destination of the converted bottom row.</param>
+        /// <param name="len">Number of pixels per row.</param>
+        /// <param name="uvBuffer">Scratch buffer, only used by the SSE4.1 implementation.</param>
+        public static void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse41.IsSupported)
+            {
+                UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
+                return;
+            }
+#endif
+
+            UpSampleScalar(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len);
+        }
+
+        // Scalar version of the upsampler. U and V are packed into one uint by LoadUv and read back
+        // as (uv & 0xff) and (uv >> 16), so both channels are interpolated with one set of additions.
+        // The first pixel, and the last one when len is even, only have a vertical neighbour and are
+        // blended as (3 * near + far + 2) / 4. All other pixels are handled two at a time.
+        private static void UpSampleScalar(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len)
+        {
+            const int BytesPerPixel = 3;
+            const uint EdgeRounding = 0x00020002u;
+            bool hasBottomRow = bottomY != default;
+            int pairCount = (len - 1) >> 1;
+
+            uint topLeft = LoadUv(topU[0], topV[0]);
+            uint left = LoadUv(curU[0], curV[0]);
+
+            // First pixel of each row.
+            uint edge = ((3 * topLeft) + left + EdgeRounding) >> 2;
+            YuvToBgr(topY[0], (int)(edge & 0xff), (int)(edge >> 16), topDst);
+            if (hasBottomRow)
+            {
+                edge = ((3 * left) + topLeft + EdgeRounding) >> 2;
+                YuvToBgr(bottomY[0], (int)(edge & 0xff), (int)(edge >> 16), bottomDst);
+            }
+
+            for (int pair = 1; pair <= pairCount; pair++)
+            {
+                uint top = LoadUv(topU[pair], topV[pair]);
+                uint current = LoadUv(curU[pair], curV[pair]);
+
+                // Both diagonals share the rounded sum of the four surrounding samples.
+                uint roundedSum = topLeft + top + left + current + 0x00080008u;
+                uint diag12 = (roundedSum + (2 * (top + left))) >> 3;
+                uint diag03 = (roundedSum + (2 * (topLeft + current))) >> 3;
+
+                int evenX = pair * 2;
+                int oddX = evenX - 1;
+
+                uint uvOdd = (diag12 + topLeft) >> 1;
+                uint uvEven = (diag03 + top) >> 1;
+                YuvToBgr(topY[oddX], (int)(uvOdd & 0xff), (int)(uvOdd >> 16), topDst.Slice(oddX * BytesPerPixel));
+                YuvToBgr(topY[evenX], (int)(uvEven & 0xff), (int)(uvEven >> 16), topDst.Slice(evenX * BytesPerPixel));
+
+                if (hasBottomRow)
+                {
+                    uvOdd = (diag03 + left) >> 1;
+                    uvEven = (diag12 + current) >> 1;
+                    YuvToBgr(bottomY[oddX], (int)(uvOdd & 0xff), (int)(uvOdd >> 16), bottomDst.Slice(oddX * BytesPerPixel));
+                    YuvToBgr(bottomY[evenX], (int)(uvEven & 0xff), (int)(uvEven >> 16), bottomDst.Slice(evenX * BytesPerPixel));
+                }
+
+                // The samples on the right become the left samples of the next pair.
+                topLeft = top;
+                left = current;
+            }
+
+            // An odd length is fully covered by the first pixel plus the pairs.
+            if ((len & 1) != 0)
+            {
+                return;
+            }
+
+            // Last pixel of an even-length row.
+            int lastX = len - 1;
+            edge = ((3 * topLeft) + left + EdgeRounding) >> 2;
+            YuvToBgr(topY[lastX], (int)(edge & 0xff), (int)(edge >> 16), topDst.Slice(lastX * BytesPerPixel));
+            if (hasBottomRow)
+            {
+                edge = ((3 * left) + topLeft + EdgeRounding) >> 2;
+                YuvToBgr(bottomY[lastX], (int)(edge & 0xff), (int)(edge >> 16), bottomDst.Slice(lastX * BytesPerPixel));
+            }
+        }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        // We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
+        // u = (9*a + 3*b + 3*c + d + 8) / 16
+        //   = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
+        //   = (a + m + 1) / 2
+        // where m = (a + 3*b + 3*c + d) / 8
+        //         = ((a + b + c + d) / 2 + b + c) / 4
+        //
+        // Let's say  k = (a + b + c + d) / 4.
+        // We can compute k as
+        // k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
+        // where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
+        //
+        // Then m can be written as
+        // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
+        //
+        // SSE4.1 version of the upsampler. The first pixel is handled with scalar code, full
+        // blocks of 32 pixels are converted straight into the destination rows, and the remaining
+        // pixels are converted inside the scratch buffer and then copied out.
+        //
+        // Layout of uvBuffer as used below (offsets relative to ru, which starts at byte 15):
+        //   [  0,  32) upsampled U, top row       [ 32,  64) upsampled V, top row
+        //   [ 64,  96) upsampled U, bottom row    [ 96, 128) upsampled V, bottom row
+        //   [128, 256) temporary top destination  [256, 384) temporary bottom destination
+        //   [384, 416) temporary top luma         [416, 448) temporary bottom luma
+        // so uvBuffer has to hold at least 15 + 448 bytes.
+        //
+        // A default bottomY span means there is no bottom row to produce.
+        private static void UpSampleSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
+        {
+            const int xStep = 3;
+
+            // Decide once whether a bottom row exists, using the same comparison everywhere.
+            bool hasBottomRow = bottomY != default;
+
+            Array.Clear(uvBuffer, 0, uvBuffer.Length);
+            Span<byte> ru = uvBuffer.AsSpan(15);
+            Span<byte> rv = ru.Slice(32);
+
+            // Treat the first pixel in regular way.
+            int uDiag = ((topU[0] + curU[0]) >> 1) + 1;
+            int vDiag = ((topV[0] + curV[0]) >> 1) + 1;
+            int u0t = (topU[0] + uDiag) >> 1;
+            int v0t = (topV[0] + vDiag) >> 1;
+            YuvToBgr(topY[0], u0t, v0t, topDst);
+            if (hasBottomRow)
+            {
+                int u0b = (curU[0] + uDiag) >> 1;
+                int v0b = (curV[0] + vDiag) >> 1;
+                YuvToBgr(bottomY[0], u0b, v0b, bottomDst);
+            }
+
+            // For UpSample32Pixels, 17 u/v values must be readable for each block.
+            // pos counts pixels (starting after the first one), uvPos counts chroma samples.
+            int pos;
+            int uvPos;
+            ref byte topURef = ref MemoryMarshal.GetReference(topU);
+            ref byte topVRef = ref MemoryMarshal.GetReference(topV);
+            ref byte curURef = ref MemoryMarshal.GetReference(curU);
+            ref byte curVRef = ref MemoryMarshal.GetReference(curV);
+            if (hasBottomRow)
+            {
+                for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
+                {
+                    UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, uvPos), ru);
+                    UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv);
+                    ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
+                }
+            }
+            else
+            {
+                for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
+                {
+                    UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, uvPos), ru);
+                    UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv);
+                    ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep);
+                }
+            }
+
+            // Process last block.
+            if (len > 1)
+            {
+                // Number of chroma samples that are still unprocessed.
+                int leftOver = ((len + 1) >> 1) - (pos >> 1);
+                int remaining = len - pos;
+                Span<byte> tmpTopDst = ru.Slice(4 * 32);
+                Span<byte> tmpBottomDst = tmpTopDst.Slice(4 * 32);
+                Span<byte> tmpTop = tmpBottomDst.Slice(4 * 32);
+                Span<byte> tmpBottom = hasBottomRow ? tmpTop.Slice(32) : default;
+                UpSampleLastBlock(topU.Slice(uvPos), curU.Slice(uvPos), leftOver, ru);
+                UpSampleLastBlock(topV.Slice(uvPos), curV.Slice(uvPos), leftOver, rv);
+
+                // Convert a full block inside the scratch buffer so the vector code never reads
+                // or writes past the caller's rows, then copy out only the pixels that exist.
+                topY.Slice(pos, remaining).CopyTo(tmpTop);
+                if (hasBottomRow)
+                {
+                    bottomY.Slice(pos, remaining).CopyTo(tmpBottom);
+                    ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
+                }
+                else
+                {
+                    ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep);
+                }
+
+                tmpTopDst.Slice(0, remaining * xStep).CopyTo(topDst.Slice(pos * xStep));
+                if (hasBottomRow)
+                {
+                    tmpBottomDst.Slice(0, remaining * xStep).CopyTo(bottomDst.Slice(pos * xStep));
+                }
+            }
+        }
+
+        // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
+        // r1 and r2 must each reference at least 17 readable bytes (two overlapping 16-byte loads
+        // at offsets 0 and 1). The 32 values for the r1 row are stored at output[0..32) and the
+        // 32 values for the r2 row at output[64..96).
+        private static void UpSample32Pixels(ref byte r1, ref byte r2, Span<byte> output)
+        {
+            // Load inputs: a/b are horizontally adjacent samples of r1, c/d those of r2.
+            Vector128<byte> a = Unsafe.As<byte, Vector128<byte>>(ref r1);
+            Vector128<byte> b = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref r1, 1));
+            Vector128<byte> c = Unsafe.As<byte, Vector128<byte>>(ref r2);
+            Vector128<byte> d = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref r2, 1));
+
+            Vector128<byte> s = Sse2.Average(a, d); // s = (a + d + 1) / 2
+            Vector128<byte> t = Sse2.Average(b, c); // t = (b + c + 1) / 2
+            Vector128<byte> st = Sse2.Xor(s, t); // st = s^t
+
+            Vector128<byte> ad = Sse2.Xor(a, d); // ad = a^d
+            Vector128<byte> bc = Sse2.Xor(b, c); // bc = b^c
+
+            // The averages round up; subtracting the low-bit correction gives the truncated mean.
+            Vector128<byte> t1 = Sse2.Or(ad, bc); // (a^d) | (b^c)
+            Vector128<byte> t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t)
+            Vector128<byte> t3 = Sse2.And(t2, One); // ((a^d) | (b^c) | (s^t)) & 1
+            Vector128<byte> t4 = Sse2.Average(s, t); // (s + t + 1) / 2
+            Vector128<byte> k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4
+
+            // diag1 weights the b/c diagonal (m = (a + 3*b + 3*c + d) / 8), diag2 the a/d diagonal.
+            Vector128<byte> diag1 = GetM(k, st, bc, t);
+            Vector128<byte> diag2 = GetM(k, st, ad, s);
+
+            // Pack the alternate pixels.
+            PackAndStore(a, b, diag1, diag2, output); // store top.
+            PackAndStore(c, d, diag2, diag1, output.Slice(2 * 32)); // store bottom.
+        }
+
+        // Upsamples a partial block: copies the numPixels available samples of each row into
+        // 17-byte stack buffers, pads the rest by repeating the last sample and then runs the
+        // regular 32 pixel upsampler on the padded rows.
+        private static void UpSampleLastBlock(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
+        {
+            // UpSample32Pixels reads 17 samples per row.
+            const int SamplesPerBlock = 17;
+
+            Span<byte> topRow = stackalloc byte[SamplesPerBlock];
+            Span<byte> bottomRow = stackalloc byte[SamplesPerBlock];
+            tb.Slice(0, numPixels).CopyTo(topRow);
+            bb.Slice(0, numPixels).CopyTo(bottomRow);
+
+            // Replicate the last sample into the unused tail.
+            if (SamplesPerBlock - numPixels > 0)
+            {
+                topRow.Slice(numPixels).Fill(topRow[numPixels - 1]);
+                bottomRow.Slice(numPixels).Fill(bottomRow[numPixels - 1]);
+            }
+
+            UpSample32Pixels(ref MemoryMarshal.GetReference(topRow), ref MemoryMarshal.GetReference(bottomRow), output);
+        }
+
+        // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
+        // i.e. the rounded-up average of k and input minus a one-bit correction term.
+        private static Vector128<byte> GetM(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
+        {
+            // (k + in + 1) / 2
+            Vector128<byte> roundedAverage = Sse2.Average(k, input);
+
+            // ((ij & (s^t)) | (k^in)) & 1
+            Vector128<byte> correctionBits = Sse2.Or(Sse2.And(ij, st), Sse2.Xor(k, input));
+            Vector128<byte> lsbCorrection = Sse2.And(correctionBits, One);
+
+            return Sse2.Subtract(roundedAverage, lsbCorrection);
+        }
+
+        // Averages each source vector with its diagonal term, interleaves the two results byte by
+        // byte (ta0 tb0 ta1 tb1 ...) and stores the resulting 32 bytes at the start of output.
+        private static void PackAndStore(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
+        {
+            Vector128<byte> evenPixels = Sse2.Average(a, da); // (9a + 3b + 3c +  d + 8) / 16
+            Vector128<byte> oddPixels = Sse2.Average(b, db); // (3a + 9b +  c + 3d + 8) / 16
+
+            // View the output as consecutive 16-byte vectors and fill the first two.
+            ref Vector128<byte> destination = ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(output));
+            destination = Sse2.UnpackLow(evenPixels, oddPixels);
+            Unsafe.Add(ref destination, 1) = Sse2.UnpackHigh(evenPixels, oddPixels);
+        }
+#endif
+
        /// <summary>
        /// Converts the RGB values of the image to YUV.
        /// </summary>
@ -312,6 +602,175 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
            bgr[0] = (byte)YuvToB(y, u);
        }

+#if SUPPORTS_RUNTIME_INTRINSICS
+
+        // Converts one block of pixels of a single row, starting at pixel curX.
+        // step is the number of destination bytes per pixel.
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static void ConvertYuvToBgrSse41(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+        {
+            Span<byte> luma = topY.Slice(curX);
+            Span<byte> destination = topDst.Slice(curX * step);
+            YuvToBgrSse41(luma, ru, rv, destination);
+        }
+
+        // Converts one block of pixels of the top and the bottom row, starting at pixel curX.
+        // The chroma of the bottom row is read 64 bytes after the chroma of the top row.
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static void ConvertYuvToBgrWithBottomYSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+        {
+            const int BottomChromaOffset = 64;
+
+            // Top row.
+            ConvertYuvToBgrSse41(topY, topDst, ru, rv, curX, step);
+
+            // Bottom row.
+            Span<byte> bottomLuma = bottomY.Slice(curX);
+            Span<byte> bottomU = ru.Slice(BottomChromaOffset);
+            Span<byte> bottomV = rv.Slice(BottomChromaOffset);
+            YuvToBgrSse41(bottomLuma, bottomU, bottomV, bottomDst.Slice(curX * step));
+        }
+
+        // Converts 32 pixels of YUV444 to interleaved BGR and writes the 96 resulting bytes to dst.
+        private static void YuvToBgrSse41(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
+        {
+            ref byte lumaBase = ref MemoryMarshal.GetReference(y);
+            ref byte uBase = ref MemoryMarshal.GetReference(u);
+            ref byte vBase = ref MemoryMarshal.GetReference(v);
+
+            // Four groups of 8 pixels, each producing 16-bit red, green and blue values.
+            ConvertYuv444ToBgrSse41(ref lumaBase, ref uBase, ref vBase, out Vector128<short> red0, out Vector128<short> green0, out Vector128<short> blue0);
+            ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref lumaBase, 8), ref Unsafe.Add(ref uBase, 8), ref Unsafe.Add(ref vBase, 8), out Vector128<short> red1, out Vector128<short> green1, out Vector128<short> blue1);
+            ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref lumaBase, 16), ref Unsafe.Add(ref uBase, 16), ref Unsafe.Add(ref vBase, 16), out Vector128<short> red2, out Vector128<short> green2, out Vector128<short> blue2);
+            ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref lumaBase, 24), ref Unsafe.Add(ref uBase, 24), ref Unsafe.Add(ref vBase, 24), out Vector128<short> red3, out Vector128<short> green3, out Vector128<short> blue3);
+
+            // Saturate to 8b (planar: all blue, then all green, then all red)
+            // and interleave into BGRBGRBGR...
+            PlanarTo24bSse41(
+                Sse2.PackUnsignedSaturate(blue0, blue1),
+                Sse2.PackUnsignedSaturate(blue2, blue3),
+                Sse2.PackUnsignedSaturate(green0, green1),
+                Sse2.PackUnsignedSaturate(green2, green3),
+                Sse2.PackUnsignedSaturate(red0, red1),
+                Sse2.PackUnsignedSaturate(red2, red3),
+                dst);
+        }
+
+        // Pack three planar channels triplet by triplet into the output buffer.
+        // input0/input1 hold the 32 values of the first byte of each triplet, input2/input3 those
+        // of the second byte and input4/input5 those of the third byte. YuvToBgrSse41 passes
+        // blue, green, red, so the output is bgrbgrbgr...
+        // Exactly 96 bytes (32 triplets) are written to rgb.
+        private static void PlanarTo24bSse41(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
+        {
+            // Each channel is spread over six vectors with byte shuffles: every value lands on
+            // its position inside the triplet and all other lanes are zero.
+            // With four values per register for the sake of explanation, and channels x, y, z:
+            // Input:
+            //   x0x1x2x3 | y0y1y2y3 | z0z1z2z3
+            // After the shuffles:
+            //   x0 0 0 x1 | 0 0 x2 0 | 0 x3 0 0
+            //   0 y0 0 0 | y1 0 0 y2 | 0 0 y3 0
+            //   0 0 z0 0 | 0 z1 0 0 | z2 0 0 z3
+            // OR-ing the three rows gives:
+            //   x0y0z0x1 | y1z1x2y2 | z2x3y3z3
+
+            // Process the first channel (byte 0 of each triplet).
+            ChannelMixing(
+                input0,
+                input1,
+                PlanarTo24Shuffle0,
+                PlanarTo24Shuffle1,
+                PlanarTo24Shuffle2,
+                out Vector128<byte> r0,
+                out Vector128<byte> r1,
+                out Vector128<byte> r2,
+                out Vector128<byte> r3,
+                out Vector128<byte> r4,
+                out Vector128<byte> r5);
+
+            // Process the second channel (byte 1 of each triplet).
+            // Same as before, with every value placed one byte further along.
+            ChannelMixing(
+                input2,
+                input3,
+                PlanarTo24Shuffle3,
+                PlanarTo24Shuffle4,
+                PlanarTo24Shuffle5,
+                out Vector128<byte> g0,
+                out Vector128<byte> g1,
+                out Vector128<byte> g2,
+                out Vector128<byte> g3,
+                out Vector128<byte> g4,
+                out Vector128<byte> g5);
+
+            // Process the third channel (byte 2 of each triplet).
+            ChannelMixing(
+                input4,
+                input5,
+                PlanarTo24Shuffle6,
+                PlanarTo24Shuffle7,
+                PlanarTo24Shuffle8,
+                out Vector128<byte> b0,
+                out Vector128<byte> b1,
+                out Vector128<byte> b2,
+                out Vector128<byte> b3,
+                out Vector128<byte> b4,
+                out Vector128<byte> b5);
+
+            // OR the different channels. The non-zero lanes of the three channels never overlap.
+            Vector128<byte> rg0 = Sse2.Or(r0, g0);
+            Vector128<byte> rg1 = Sse2.Or(r1, g1);
+            Vector128<byte> rg2 = Sse2.Or(r2, g2);
+            Vector128<byte> rg3 = Sse2.Or(r3, g3);
+            Vector128<byte> rg4 = Sse2.Or(r4, g4);
+            Vector128<byte> rg5 = Sse2.Or(r5, g5);
+
+            // Store the six vectors back to back: 6 * 16 = 96 bytes.
+            ref byte outputRef = ref MemoryMarshal.GetReference(rgb);
+            Unsafe.As<byte, Vector128<byte>>(ref outputRef) = Sse2.Or(rg0, b0);
+            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1);
+            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2);
+            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3);
+            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4);
+            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5);
+        }
+
+        // Shuffles the input buffer as A0 0 0 A1 0 0 A2
+        // Applies the three shuffle masks to both halves of one channel:
+        // output0..2 are built from input0 and output3..5 from input1.
+        private static void ChannelMixing(
+            Vector128<byte> input0,
+            Vector128<byte> input1,
+            Vector128<byte> shuffle0,
+            Vector128<byte> shuffle1,
+            Vector128<byte> shuffle2,
+            out Vector128<byte> output0,
+            out Vector128<byte> output1,
+            out Vector128<byte> output2,
+            out Vector128<byte> output3,
+            out Vector128<byte> output4,
+            out Vector128<byte> output5)
+        {
+            // First mask, both halves.
+            output0 = Ssse3.Shuffle(input0, shuffle0);
+            output3 = Ssse3.Shuffle(input1, shuffle0);
+
+            // Second mask, both halves.
+            output1 = Ssse3.Shuffle(input0, shuffle1);
+            output4 = Ssse3.Shuffle(input1, shuffle1);
+
+            // Third mask, both halves.
+            output2 = Ssse3.Shuffle(input0, shuffle2);
+            output5 = Ssse3.Shuffle(input1, shuffle2);
+        }
+
+        // Convert 8 samples of YUV444 to B/G/R.
+        // y, u and v each reference 8 readable bytes. The results are 16-bit values that still
+        // have to be saturated to bytes by the caller.
+        private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
+        {
+            // Load exactly 8 bytes per plane. Only the low 8 bytes are consumed by UnpackLow, and
+            // a full 16-byte load would read 8 bytes past the last group of a 32 sample block.
+            // The upper half left undefined by CreateScalarUnsafe is never used.
+            Vector128<byte> y0 = Vector128.CreateScalarUnsafe(Unsafe.ReadUnaligned<long>(ref y)).AsByte();
+            Vector128<byte> u0 = Vector128.CreateScalarUnsafe(Unsafe.ReadUnaligned<long>(ref u)).AsByte();
+            Vector128<byte> v0 = Vector128.CreateScalarUnsafe(Unsafe.ReadUnaligned<long>(ref v)).AsByte();
+
+            // Move the bytes into the *upper* part of 16b words. That's "<< 8", basically.
+            y0 = Sse2.UnpackLow(Vector128<byte>.Zero, y0);
+            u0 = Sse2.UnpackLow(Vector128<byte>.Zero, u0);
+            v0 = Sse2.UnpackLow(Vector128<byte>.Zero, v0);
+
+            // Unsigned high multiplies: (sample << 8) * constant >> 16.
+            Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16());
+            Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16());
+            Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16());
+            Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16());
+
+            // R = y + v term - 14234
+            Vector128<ushort> r1 = Sse2.Subtract(y1, K14234.AsUInt16());
+            Vector128<ushort> r2 = Sse2.Add(r1, r0);
+
+            // G = y + 8708 - (u term + v term)
+            Vector128<ushort> g2 = Sse2.Add(y1, K8708.AsUInt16());
+            Vector128<ushort> g3 = Sse2.Add(g0, g1);
+            Vector128<ushort> g4 = Sse2.Subtract(g2, g3);
+
+            // B = y + u term - 17685, with unsigned saturation because the sum exceeds 32767.
+            Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), K33050.AsUInt16());
+            Vector128<ushort> b1 = Sse2.AddSaturate(b0, y1);
+            Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, K17685.AsUInt16());
+
+            // Use logical shift for B2, which can be larger than 32767.
+            r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
+            g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
+            b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
+        }
+
+#endif
+
        [MethodImpl(InliningOptions.ShortMethod)]
        public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685);

--- a/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
+++ b/src/ImageSharp/Formats/Webp/WebpLookupTables.cs
@ -239,7 +239,8 @@ namespace SixLabors.ImageSharp.Formats.Webp
            }
        };

-        public static readonly byte[] Norm =
+        // This uses C#'s compiler optimization to refer to assembly's static data directly.
+        public static ReadOnlySpan<byte> Norm => new byte[]
        {
            // renorm_sizes[i] = 8 - log2(i)
            7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
--- a/src/ImageSharp/Processing/Extensions/Normalization/HistogramEqualizationExtensions.cs
+++ b/src/ImageSharp/Processing/Extensions/Normalization/HistogramEqualizationExtensions.cs
@ -16,7 +16,7 @@ namespace SixLabors.ImageSharp.Processing
        /// <param name="source">The image this method extends.</param>
        /// <returns>The <see cref="IImageProcessingContext"/> to allow chaining of operations.</returns>
        public static IImageProcessingContext HistogramEqualization(this IImageProcessingContext source) =>
-            HistogramEqualization(source, HistogramEqualizationOptions.Default);
+            HistogramEqualization(source, new HistogramEqualizationOptions());

        /// <summary>
        /// Equalizes the histogram of an image to increases the contrast.
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
@ -396,6 +396,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution

                    PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);

+                    Numerics.Premultiply(sourceBuffer);
+
                    ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
                    ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceBase, sourceBuffer.Length);
                    ref Vector4 targetStart = ref targetBase;
--- a/src/ImageSharp/Processing/Processors/Normalization/HistogramEqualizationOptions.cs
+++ b/src/ImageSharp/Processing/Processors/Normalization/HistogramEqualizationOptions.cs
@ -8,11 +8,6 @@ namespace SixLabors.ImageSharp.Processing.Processors.Normalization
    /// </summary>
    public class HistogramEqualizationOptions
    {
-        /// <summary>
-        /// Gets the default <see cref="HistogramEqualizationOptions"/> instance.
-        /// </summary>
-        public static HistogramEqualizationOptions Default { get; } = new HistogramEqualizationOptions();
-
        /// <summary>
        /// Gets or sets the histogram equalization method to use. Defaults to global histogram equalization.
        /// </summary>
--- a/src/ImageSharp/Processing/Processors/Quantization/EuclideanPixelMap{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Quantization/EuclideanPixelMap{TPixel}.cs
@ -22,7 +22,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Quantization
        where TPixel : unmanaged, IPixel<TPixel>
    {
        private Rgba32[] rgbaPalette;
-        private readonly ColorDistanceCache cache;
+
+        // Do not make this readonly! Struct value would be always copied on non-readonly method calls.
+        private ColorDistanceCache cache;
        private readonly Configuration configuration;

        /// <summary>
--- a/tests/ImageSharp.Benchmarks/Codecs/Bmp/DecodeBmp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Bmp/DecodeBmp.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Bmp/EncodeBmp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Bmp/EncodeBmp.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Bmp/EncodeBmpMultiple.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Bmp/EncodeBmpMultiple.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Gif/DecodeGif.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Gif/DecodeGif.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Gif/EncodeGif.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Gif/EncodeGif.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Gif/EncodeGifMultiple.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Gif/EncodeGifMultiple.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/CmykColorConversion.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/CmykColorConversion.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/ColorConversionBenchmark.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/ColorConversionBenchmark.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/GrayscaleColorConversion.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/GrayscaleColorConversion.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/RgbColorConversion.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/RgbColorConversion.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/YCbCrColorConversion.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/YCbCrColorConversion.cs
--- a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
+++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/YccKColorConverter.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/ColorConversion/YccKColorConverter.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs
@ -0,0 +1,82 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System.IO;
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Formats.Jpeg;
+using SixLabors.ImageSharp.Tests;
+
+namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
+{
+    /// <summary>
+    /// Benchmarks decoding of several kinds of jpeg images. Each benchmark has its own global
+    /// setup which loads the matching test image into memory, so file access is not measured.
+    /// </summary>
+    public class DecodeJpeg
+    {
+        private JpegDecoder decoder;
+
+        private MemoryStream preloadedImageStream;
+
+        [GlobalSetup(Target = nameof(JpegBaselineInterleaved444))]
+        public void SetupBaselineInterleaved444() =>
+            this.GenericSetup(TestImages.Jpeg.Baseline.Winter444_Interleaved);
+
+        [GlobalSetup(Target = nameof(JpegBaselineInterleaved420))]
+        public void SetupBaselineInterleaved420() =>
+            this.GenericSetup(TestImages.Jpeg.Baseline.Hiyamugi);
+
+        [GlobalSetup(Target = nameof(JpegBaseline400))]
+        public void SetupBaselineSingleComponent() =>
+            this.GenericSetup(TestImages.Jpeg.Baseline.Jpeg400);
+
+        [GlobalSetup(Target = nameof(JpegProgressiveNonInterleaved420))]
+        public void SetupProgressiveNoninterleaved420() =>
+            this.GenericSetup(TestImages.Jpeg.Progressive.Winter420_NonInterleaved);
+
+        [GlobalCleanup]
+        public void Cleanup()
+        {
+            this.preloadedImageStream.Dispose();
+            this.preloadedImageStream = null;
+        }
+
+        [Benchmark(Description = "Baseline 4:4:4 Interleaved")]
+        public void JpegBaselineInterleaved444() => this.GenericBenchmark();
+
+        [Benchmark(Description = "Baseline 4:2:0 Interleaved")]
+        public void JpegBaselineInterleaved420() => this.GenericBenchmark();
+
+        [Benchmark(Description = "Baseline 4:0:0 (grayscale)")]
+        public void JpegBaseline400() => this.GenericBenchmark();
+
+        [Benchmark(Description = "Progressive 4:2:0 Non-Interleaved")]
+        public void JpegProgressiveNonInterleaved420() => this.GenericBenchmark();
+
+        // Creates the decoder and reads the whole image file into a memory stream.
+        private void GenericSetup(string imageSubpath)
+        {
+            this.decoder = new JpegDecoder();
+
+            string imagePath = Path.Combine(TestEnvironment.InputImagesDirectoryFullPath, imageSubpath);
+            this.preloadedImageStream = new MemoryStream(File.ReadAllBytes(imagePath));
+        }
+
+        // Rewinds the preloaded stream and decodes it; the decoded image is disposed right away.
+        private void GenericBenchmark()
+        {
+            this.preloadedImageStream.Position = 0;
+            using Image img = this.decoder.Decode(Configuration.Default, this.preloadedImageStream);
+        }
+    }
+}
+
+
+/*
+BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1348 (20H2/October2020Update)
+Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
+.NET SDK=6.0.100-preview.3.21202.5
+  [Host]     : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+  DefaultJob : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+
+
+|                              Method |      Mean |     Error |    StdDev |
+|------------------------------------ |----------:|----------:|----------:|
+|        'Baseline 4:4:4 Interleaved' | 11.127 ms | 0.0659 ms | 0.0550 ms |
+|        'Baseline 4:2:0 Interleaved' |  8.458 ms | 0.0289 ms | 0.0256 ms |
+|        'Baseline 4:0:0 (grayscale)' |  1.550 ms | 0.0050 ms | 0.0044 ms |
+| 'Progressive 4:2:0 Non-Interleaved' | 13.220 ms | 0.0449 ms | 0.0398 ms |
+*/
--- a/tests/ImageSharp.Benchmarks/Codecs/Png/DecodeFilteredPng.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Png/DecodeFilteredPng.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Png/DecodePng.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Png/DecodePng.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Png/EncodeIndexedPng.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Png/EncodeIndexedPng.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Png/EncodePng.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Png/EncodePng.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Tga/DecodeTga.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Tga/DecodeTga.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Tga/EncodeTga.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Tga/EncodeTga.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Tiff/DecodeTiff.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Tiff/DecodeTiff.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Tiff/EncodeTiff.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Tiff/EncodeTiff.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Webp/DecodeWebp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Webp/DecodeWebp.cs
--- a/tests/ImageSharp.Benchmarks/Codecs/Webp/EncodeWebp.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Webp/EncodeWebp.cs
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@ -183,9 +183,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                Assert.Equal(expected, actual);
            }

+            // This method has only 2 implementations:
+            // 1. AVX
+            // 2. Scalar
            FeatureTestRunner.RunWithHwIntrinsicsFeature(
                RunTest,
-                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableHWIntrinsic);
        }

        private static float[] Create8x8ColorCropTestData()
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
@ -276,5 +276,31 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                seed,
                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
        }
+
+        [Fact]
+        public void TransposeInplace()
+        {
+            static void RunTest()
+            {
+                short[] expected = Create8x8ShortData();
+                ReferenceImplementations.Transpose8x8(expected);
+
+                var block8x8 = default(Block8x8);
+                block8x8.LoadFrom(Create8x8ShortData());
+
+                block8x8.TransposeInplace();
+
+                short[] actual = new short[64];
+                block8x8.CopyTo(actual);
+
+                Assert.Equal(expected, actual);
+            }
+
+            // This method has only 1 implementation:
+            // 1. Scalar
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                HwIntrinsics.DisableHWIntrinsic);
+        }
    }
 }
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@ -2,9 +2,6 @@
 // Licensed under the Apache License, Version 2.0.

 using System;
-#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics.X86;
-#endif
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
 using SixLabors.ImageSharp.Tests.TestUtilities;
@ -17,6 +14,20 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
    [Trait("Format", "Jpg")]
    public static class DCTTests
    {
+        private const int MaxAllowedValue = short.MaxValue;
+        private const int MinAllowedValue = short.MinValue;
+
+        internal static Block8x8F CreateBlockFromScalar(float value)
+        {
+            Block8x8F result = default;
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                result[i] = value;
+            }
+
+            return result;
+        }
+
        public class FastFloatingPoint : JpegFixture
        {
            public FastFloatingPoint(ITestOutputHelper output)
@ -24,130 +35,75 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            {
            }

-            // Reference tests
            [Theory]
            [InlineData(1)]
            [InlineData(2)]
            [InlineData(3)]
            public void LLM_TransformIDCT_CompareToNonOptimized(int seed)
            {
-                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+                float[] sourceArray = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);

                var srcBlock = Block8x8F.Load(sourceArray);

+                // reference
                Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock);

-                var temp = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);
-
-                this.CompareBlocks(expected, srcBlock, 1f);
-            }
-
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            [InlineData(3)]
-            public void LLM_TransformIDCT_CompareToAccurate(int seed)
-            {
-                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+                // testee
+                // Part of the IDCT calculations is fused into the quantization step
+                // We must multiply input block with adjusted no-quantization matrix
+                // before applying IDCT
+                // Dequantization using unit matrix - no values are upscaled
+                Block8x8F dequantMatrix = CreateBlockFromScalar(1);

-                var srcBlock = Block8x8F.Load(sourceArray);
+                // This step is needed to apply adjusting multipliers to the input block
+                FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);

-                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock);
+                // IDCT implementation transforms blocks after transposition
+                srcBlock.TransposeInplace();
+                srcBlock.MultiplyInPlace(ref dequantMatrix);

-                var temp = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);
+                // IDCT calculation
+                FastFloatingPointDCT.TransformIDCT(ref srcBlock);

                this.CompareBlocks(expected, srcBlock, 1f);
            }

-            // Inverse transform
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            public void IDCT8x4_LeftPart(int seed)
-            {
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
-
-                var destBlock = default(Block8x8F);
-
-                var expectedDest = new float[64];
-
-                // reference
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
-
-                // testee
-                FastFloatingPointDCT.IDCT8x4_LeftPart(ref srcBlock, ref destBlock);
-
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-            }
-
            [Theory]
            [InlineData(1)]
            [InlineData(2)]
-            public void IDCT8x4_RightPart(int seed)
+            [InlineData(3)]
+            public void LLM_TransformIDCT_CompareToAccurate(int seed)
            {
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                var srcBlock = default(Block8x8F);
-                srcBlock.LoadFrom(src);
+                float[] sourceArray = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);

-                var destBlock = default(Block8x8F);
-
-                var expectedDest = new float[64];
+                var srcBlock = Block8x8F.Load(sourceArray);

                // reference
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
+                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock);

                // testee
-                FastFloatingPointDCT.IDCT8x4_RightPart(ref srcBlock, ref destBlock);
-
-                var actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-            }
-
-            [Theory]
-            [InlineData(1)]
-            [InlineData(2)]
-            public void IDCT8x8_Avx(int seed)
-            {
-#if SUPPORTS_RUNTIME_INTRINSICS
-                if (!Avx.IsSupported)
-                {
-                    this.Output.WriteLine("No AVX present, skipping test!");
-                    return;
-                }
-
-                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
-                Block8x8F srcBlock = default;
-                srcBlock.LoadFrom(src);
+                // Part of the IDCT calculations is fused into the quantization step
+                // We must multiply input block with adjusted no-quantization matrix
+                // before applying IDCT
+                // Dequantization using unit matrix - no values are upscaled
+                Block8x8F dequantMatrix = CreateBlockFromScalar(1);

-                Block8x8F destBlock = default;
+                // This step is needed to apply adjusting multipliers to the input block
+                FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);

-                float[] expectedDest = new float[64];
+                // IDCT implementation transforms blocks after transposition
+                srcBlock.TransposeInplace();
+                srcBlock.MultiplyInPlace(ref dequantMatrix);

-                // reference, left part
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
+                // IDCT calculation
+                FastFloatingPointDCT.TransformIDCT(ref srcBlock);

-                // reference, right part
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
-
-                // testee, whole 8x8
-                FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock);
-
-                float[] actualDest = new float[64];
-                destBlock.ScaledCopyTo(actualDest);
-
-                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-#endif
+                this.CompareBlocks(expected, srcBlock, 1f);
            }

+            // Inverse transform
+            // This test covers entire IDCT conversion chain
+            // This test checks all hardware implementations
            [Theory]
            [InlineData(1)]
            [InlineData(2)]
@ -157,41 +113,53 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                {
                    int seed = FeatureTestRunner.Deserialize<int>(serialized);

-                    Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                    Span<float> src = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
                    var srcBlock = default(Block8x8F);
                    srcBlock.LoadFrom(src);

-                    var expectedDest = new float[64];
-                    var temp1 = new float[64];
-                    var temp2 = default(Block8x8F);
+                    float[] expectedDest = new float[64];
+                    float[] temp = new float[64];

                    // reference
-                    ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1);
+                    ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp);

                    // testee
-                    FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2);
+                    // Part of the IDCT calculations is fused into the quantization step
+                    // We must multiply input block with adjusted no-quantization matrix
+                    // before applying IDCT
+                    Block8x8F dequantMatrix = CreateBlockFromScalar(1);
+
+                    // Dequantization using unit matrix - no values are upscaled
+                    // as quant matrix is all 1's
+                    // This step is needed to apply adjusting multipliers to the input block
+                    FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
+                    srcBlock.MultiplyInPlace(ref dequantMatrix);
+
+                    // IDCT implementation transforms blocks after transposition
+                    srcBlock.TransposeInplace();

-                    var actualDest = new float[64];
-                    srcBlock.ScaledCopyTo(actualDest);
+                    // IDCT calculation
+                    FastFloatingPointDCT.TransformIDCT(ref srcBlock);
+
+                    float[] actualDest = srcBlock.ToArray();

                    Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
                }

-                // 3 paths:
+                // 4 paths:
                // 1. AllowAll - call avx/fma implementation
-                // 2. DisableFMA - call avx implementation without fma acceleration
-                // 3. DisableAvx - call fallback code of Vector4 implementation
-                //
-                // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result
+                // 2. DisableFMA - call avx without fma implementation
+                // 3. DisableAvx - call sse Vector4 implementation
+                // 4. DisableHWIntrinsic - call scalar fallback implementation
                FeatureTestRunner.RunWithHwIntrinsicsFeature(
                    RunTest,
                    seed,
-                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX);
+                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
            }

            // Forward transform
-            // This test covers entire FDCT conversions chain
-            // This test checks all implementations: intrinsic and scalar fallback
+            // This test covers entire FDCT conversion chain
+            // This test checks all hardware implementations
            [Theory]
            [InlineData(1)]
            [InlineData(2)]
@ -201,7 +169,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                {
                    int seed = FeatureTestRunner.Deserialize<int>(serialized);

-                    Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                    Span<float> src = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
                    var block = default(Block8x8F);
                    block.LoadFrom(src);

@ -212,23 +180,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                    ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);

                    // testee
-                    // Part of the FDCT calculations is fused into the quantization step
-                    // We must multiply transformed block with reciprocal values from FastFloatingPointDCT.ANN_DCT_reciprocalAdjustmen
                    FastFloatingPointDCT.TransformFDCT(ref block);
-                    for (int i = 0; i < 64; i++)
-                    {
-                        block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i];
-                    }
+
+                    // Part of the FDCT calculations is fused into the quantization step
+                    // We must multiply the transformed block with the adjusted no-quantization matrix
+                    // after applying FDCT
+                    Block8x8F quantMatrix = CreateBlockFromScalar(1);
+                    FastFloatingPointDCT.AdjustToFDCT(ref quantMatrix);
+                    block.MultiplyInPlace(ref quantMatrix);

                    float[] actualDest = block.ToArray();

                    Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
                }

-                // 3 paths:
+                // 4 paths:
                // 1. AllowAll - call avx/fma implementation
-                // 2. DisableFMA - call avx implementation without fma acceleration
-                // 3. DisableAvx - call sse implementation
+                // 2. DisableFMA - call avx without fma implementation
+                // 3. DisableAvx - call sse Vector4 implementation
                // 4. DisableHWIntrinsic - call scalar fallback implementation
                FeatureTestRunner.RunWithHwIntrinsicsFeature(
                    RunTest,
--- a/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Images.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Images.cs
@ -20,6 +20,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                TestImages.Jpeg.Baseline.Jpeg420Small,
                TestImages.Jpeg.Issues.Fuzz.AccessViolationException922,
                TestImages.Jpeg.Baseline.Jpeg444,
+                TestImages.Jpeg.Baseline.Jpeg422,
                TestImages.Jpeg.Baseline.Bad.BadEOF,
                TestImages.Jpeg.Baseline.MultiScanBaselineCMYK,
                TestImages.Jpeg.Baseline.YcckSubsample1222,
@ -100,6 +101,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                [TestImages.Jpeg.Baseline.Bad.BadEOF] = 0.38f / 100,
                [TestImages.Jpeg.Baseline.Bad.BadRST] = 0.0589f / 100,

+                [TestImages.Jpeg.Baseline.Jpeg422] = 0.0013f / 100,
                [TestImages.Jpeg.Baseline.Testorig420] = 0.38f / 100,
                [TestImages.Jpeg.Baseline.Jpeg420Small] = 0.287f / 100,
                [TestImages.Jpeg.Baseline.Turtle420] = 1.0f / 100,
--- a/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Metadata.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.Metadata.cs
@ -56,7 +56,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            { TestImages.Jpeg.Progressive.Fb, 75 },
            { TestImages.Jpeg.Issues.IncorrectQuality845, 98 },
            { TestImages.Jpeg.Baseline.ForestBridgeDifferentComponentsQuality, 89 },
-            { TestImages.Jpeg.Progressive.Winter, 80 }
+            { TestImages.Jpeg.Progressive.Winter420_NonInterleaved, 80 }
        };

        [Theory]
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs
@ -172,7 +172,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils

            bool failed = false;

-            for (int i = 0; i < 64; i++)
+            for (int i = 0; i < Block8x8F.Size; i++)
            {
                float expected = a[i];
                float actual = b[i];
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
@ -48,6 +48,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils

            public short MaxVal { get; private set; } = short.MinValue;

+            internal void MakeBlock(Block8x8 block, int y, int x)
+            {
+                block.TransposeInplace();
+                this.MakeBlock(block.ToArray(), y, x);
+            }
+
            internal void MakeBlock(short[] data, int y, int x)
            {
                this.MinVal = Math.Min(this.MinVal, data.Min());
@ -66,11 +72,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
                    Span<Block8x8> blockRow = data.GetRowSpan(y - startIndex);
                    for (int x = 0; x < this.WidthInBlocks; x++)
                    {
-                        short[] block = blockRow[x].ToArray();
-
-                        // x coordinate stays the same - we load entire stride
-                        // y coordinate is tricky as we load single stride to full buffer - offset is needed
-                        this.MakeBlock(block, y, x);
+                        this.MakeBlock(blockRow[x], y, x);
                    }
                }
            }
@ -83,8 +85,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
                    Span<Block8x8> blockRow = data.GetRowSpan(y);
                    for (int x = 0; x < this.WidthInBlocks; x++)
                    {
-                        short[] block = blockRow[x].ToArray();
-                        this.MakeBlock(block, y, x);
+                        this.MakeBlock(blockRow[x], y, x);
                    }
                }
            }
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
@ -40,6 +40,23 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
            }
        }

+        /// <summary>
+        /// Transposes, in place, an 8x8 block stored linearly in a <see cref="Span{T}"/>.
+        /// </summary>
+        internal static void Transpose8x8(Span<short> data)
+        {
+            for (int i = 1; i < 8; i++)
+            {
+                int i8 = i * 8;
+                for (int j = 0; j < i; j++)
+                {
+                    short tmp = data[i8 + j];
+                    data[i8 + j] = data[(j * 8) + i];
+                    data[(j * 8) + i] = tmp;
+                }
+            }
+        }
+
        /// <summary>
        /// Transpose 8x8 block stored linearly in a  <see cref="Span{T}"/>
        /// </summary>
--- a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs
@ -1,6 +1,7 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

+using System;
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using Xunit;

@ -9,8 +10,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
    [Trait("Format", "Jpg")]
    public class ZigZagTests
    {
-        [Fact]
-        public void ZigZagCanHandleAllPossibleCoefficients()
+        private static void CanHandleAllPossibleCoefficients(ReadOnlySpan<byte> order)
        {
            // Mimic the behaviour of the huffman scan decoder using all possible byte values
            short[] block = new short[64];
@ -26,7 +26,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                    if (s != 0)
                    {
                        i += r;
-                        block[ZigZag.ZigZagOrder[i++]] = (short)s;
+                        block[order[i++]] = (short)s;
                    }
                    else
                    {
@ -40,5 +40,13 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                }
            }
        }
+
+        [Fact]
+        public static void ZigZagCanHandleAllPossibleCoefficients() =>
+            CanHandleAllPossibleCoefficients(ZigZag.ZigZagOrder);
+
+        [Fact]
+        public static void TrasposingZigZagCanHandleAllPossibleCoefficients() =>
+            CanHandleAllPossibleCoefficients(ZigZag.TransposingOrder);
    }
 }
--- a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
@ -5,7 +5,7 @@ using SixLabors.ImageSharp.Formats.Webp.Lossless;
 using SixLabors.ImageSharp.Tests.TestUtilities;
 using Xunit;

-namespace SixLabors.ImageSharp.Tests.Formats.WebP
+namespace SixLabors.ImageSharp.Tests.Formats.Webp
 {
    [Trait("Format", "Webp")]
    public class ColorSpaceTransformUtilsTests
--- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
@ -10,6 +10,17 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
    [Trait("Format", "Webp")]
    public class LosslessUtilsTests
    {
+        private static void RunCombinedShannonEntropyTest()
+        {
+            int[] x = { 3, 5, 2, 5, 3, 1, 2, 2, 3, 3, 1, 2, 1, 2, 1, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 1, 0, 0, 2, 1, 1, 0, 3, 1, 2, 3, 2, 3 };
+            int[] y = { 11, 12, 8, 3, 4, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 2, 1, 1, 2, 4, 6, 4 };
+            float expected = 884.7585f;
+
+            float actual = LosslessUtils.CombinedShannonEntropy(x, y);
+
+            Assert.Equal(expected, actual, 5);
+        }
+
        private static void RunSubtractGreenTest()
        {
            uint[] pixelData =
@ -193,6 +204,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
            }
        }

+        [Fact]
+        public void CombinedShannonEntropy_Works() => RunCombinedShannonEntropyTest();
+
        [Fact]
        public void Predictor11_Works() => RunPredictor11Test();

@ -215,6 +229,13 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
        public void TransformColorInverse_Works() => RunTransformColorInverseTest();

 #if SUPPORTS_RUNTIME_INTRINSICS
+        
+        [Fact]
+        public void CombinedShannonEntropy_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCombinedShannonEntropyTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void CombinedShannonEntropy_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCombinedShannonEntropyTest, HwIntrinsics.DisableAVX2);
+
        [Fact]
        public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll);

@ -237,19 +258,19 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
        public void SubtractGreen_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.AllowAll);

        [Fact]
-        public void SubtractGreen_WithoutAvx_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX);
+        public void SubtractGreen_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX2);

        [Fact]
-        public void SubtractGreen_WithoutAvxOrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSSE3);
+        public void SubtractGreen_WithoutAvxOrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSSE3);

        [Fact]
        public void AddGreenToBlueAndRed_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.AllowAll);

        [Fact]
-        public void AddGreenToBlueAndRed_WithoutAvx_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX);
+        public void AddGreenToBlueAndRed_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX2);

        [Fact]
-        public void AddGreenToBlueAndRed_WithoutAvxOrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3);
+        public void AddGreenToBlueAndRed_WithoutAVX2OrSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddGreenToBlueAndRedTest, HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3);

        [Fact]
        public void TransformColor_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorTest, HwIntrinsics.AllowAll);
--- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@ -6,7 +6,7 @@ using SixLabors.ImageSharp.Formats.Webp.Lossy;
 using SixLabors.ImageSharp.Tests.TestUtilities;
 using Xunit;

-namespace SixLabors.ImageSharp.Tests.Formats.WebP
+namespace SixLabors.ImageSharp.Tests.Formats.Webp
 {
    [Trait("Format", "Webp")]
    public class LossyUtilsTests
@ -38,7 +38,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
            int actual = LossyUtils.Vp8_Sse4X4(a, b);

            Assert.Equal(expected, actual);
-		}
+        }

        private static void RunMean16x4Test()
        {
--- a/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
@ -6,7 +6,7 @@ using SixLabors.ImageSharp.Formats.Webp.Lossy;
 using SixLabors.ImageSharp.Tests.TestUtilities;
 using Xunit;

-namespace SixLabors.ImageSharp.Tests.Formats.WebP
+namespace SixLabors.ImageSharp.Tests.Formats.Webp
 {
    [Trait("Format", "Webp")]
    public class QuantEncTests
--- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs
@ -6,7 +6,7 @@ using SixLabors.ImageSharp.Formats.Webp.Lossy;
 using SixLabors.ImageSharp.Tests.TestUtilities;
 using Xunit;

-namespace SixLabors.ImageSharp.Tests.Formats.WebP
+namespace SixLabors.ImageSharp.Tests.Formats.Webp
 {
    [Trait("Format", "Webp")]
    public class Vp8EncodingTests
--- a/tests/ImageSharp.Tests/Formats/WebP/Vp8LHistogramTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8LHistogramTests.cs
@ -0,0 +1,119 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Linq;
+using SixLabors.ImageSharp.Formats.Webp.Lossless;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using Xunit;
+
+namespace SixLabors.ImageSharp.Tests.Formats.Webp
+{
+    /// <summary>
+    /// Tests for adding two <see cref="Vp8LHistogram"/> instances together, with and without hardware intrinsics.
+    /// </summary>
+    [Trait("Format", "Webp")]
+    public class Vp8LHistogramTests
+    {
+        /// <summary>
+        /// Builds two histograms from the same literal-only backward references, calls Add to combine them
+        /// into a third histogram and checks the resulting literal counts against known-good values.
+        /// </summary>
+        private static void RunAddVectorTest()
+        {
+            // arrange
+            uint[] pixelData =
+            {
+                4278191104, 4278191104, 4278191104, 4278191104, 4278191104, 4278191104, 4278191104, 4294577152,
+                4294707200, 4294707200, 4294707200, 4294707200, 4294837248, 4294837248, 4293926912, 4294316544,
+                4278191104, 4278191104, 4294837248, 4294837248, 4280287232, 4280350720, 4294447104, 4294707200,
+                4294838272, 4278516736, 4294837248, 4294837248, 4278516736, 4294707200, 4279298048, 4294837248,
+                4294837248, 4294837248, 4294837248, 4280287232, 4280287232, 4292670464, 4279633408, 4294838272,
+                4294837248, 4278516736, 4278516736, 4278516736, 4278516736, 4278516736, 4278778880, 4278193152,
+                4278191104, 4280287232, 4280287232, 4280287232, 4280287232, 4293971968, 4280612864, 4292802560,
+                4294837760, 4278516736, 4278516736, 4294837760, 4294707712, 4278516736, 4294837248, 4278193152,
+                4280287232, 4278984704, 4280287232, 4278243328, 4280287232, 4278244352, 4280287232, 4280025088,
+                4280025088, 4294837760, 4278192128, 4294838784, 4294837760, 4294707712, 4278778880, 4278324224,
+                4280287232, 4280287232, 4278202368, 4279115776, 4280287232, 4278243328, 4280287232, 4280287232,
+                4280025088, 4280287232, 4278192128, 4294838272, 4294838272, 4294837760, 4278190592, 4278778880,
+                4280875008, 4280287232, 4279896576, 4281075712, 4281075712, 4280287232, 4280287232, 4280287232,
+                4280287232, 4280287232, 4278190592, 4294709248, 4278516736, 4278516736, 4278584832, 4278909440,
+                4280287232, 4280287232, 4294367744, 4294621184, 4279115776, 4280287232, 4280287232, 4280351744,
+                4280287232, 4280287232, 4280287232, 4278513664, 4278516736, 4278716416, 4278584832, 4280291328,
+                4293062144, 4280287232, 4280287232, 4280287232, 4294456320, 4280291328, 4280287232, 4280287232,
+                4280287232, 4280287232, 4280287232, 4280287232, 4278513152, 4278716416, 4278584832, 4280291328,
+                4278198272, 4278198272, 4278589952, 4278198272, 4278198272, 4280287232, 4278765568, 4280287232,
+                4280287232, 4280287232, 4280287232, 4294712832, 4278513152, 4278716640, 4279300608, 4278584832,
+                4280156672, 4279373312, 4278589952, 4279373312, 4278328832, 4278328832, 4278328832, 4279634432,
+                4280287232, 4280287232, 4280287232, 4280287232, 4278457344, 4280483328, 4278584832, 4278385664,
+                4279634432, 4279373312, 4279634432, 4280287232, 4280287232, 4280156672, 4278589952, 4278328832,
+                4278198272, 4280156672, 4280483328, 4294363648, 4280287232, 4278376448, 4280287232, 4278647808,
+                4280287232, 4280287232, 4279373312, 4280287232, 4280287232, 4280156672, 4280287232, 4278198272,
+                4278198272, 4280156672, 4280287232, 4280287232, 4293669888, 4278765568, 4278765568, 4280287232,
+                4280287232, 4280287232, 4279634432, 4279634432, 4280287232, 4280287232, 4280287232, 4280287232,
+                4280287232, 4280287232, 4280287232, 4280287232, 4279373312, 4279764992, 4293539328, 4279896576,
+                4280287232, 4280287232, 4280287232, 4279634432, 4278198272, 4279634432, 4280287232, 4280287232,
+                4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4279503872, 4279503872, 4280288256,
+                4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232,
+                4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232
+            };
+
+            // Leading portion of the expected literal counts; copied into the full-size array below.
+            uint[] literals =
+            {
+                198, 0, 14, 0, 46, 0, 22, 0, 36, 0, 24, 0, 12, 0, 10, 0, 10, 0, 2, 0, 2, 0, 0, 0, 0, 0, 6, 0, 0, 0,
+                10, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 0, 0, 0, 6, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 6, 0, 2, 0, 2, 0, 2, 0, 0, 0, 8, 0, 2, 0, 38, 0, 4
+            };
+
+            uint[] expectedLiterals = new uint[1305];
+
+            // All remaining values are expected to be zero.
+            literals.AsSpan().CopyTo(expectedLiterals);
+
+            // Every pixel value is added as a single literal entry of length 1.
+            var backwardRefs = new Vp8LBackwardRefs(pixelData.Length);
+            foreach (uint pixel in pixelData)
+            {
+                backwardRefs.Add(new PixOrCopy
+                {
+                    BgraOrDistance = pixel,
+                    Len = 1,
+                    Mode = PixOrCopyMode.Literal
+                });
+            }
+
+            var histogram0 = new Vp8LHistogram(backwardRefs, 3);
+            var histogram1 = new Vp8LHistogram(backwardRefs, 3);
+            for (int i = 0; i < 5; i++)
+            {
+                histogram0.IsUsed[i] = true;
+                histogram1.IsUsed[i] = true;
+            }
+
+            var output = new Vp8LHistogram(3);
+
+            // act
+            histogram0.Add(histogram1, output);
+
+            // assert
+            Assert.True(output.Literal.SequenceEqual(expectedLiterals));
+        }
+
+        [Fact]
+        public void AddVector_Works() => RunAddVectorTest();
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void AddVector_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddVectorTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void AddVector_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddVectorTest, HwIntrinsics.DisableAVX2);
+#endif
+    }
+}
--- a/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/YuvConversionTests.cs
@ -2,10 +2,14 @@
 // Licensed under the Apache License, Version 2.0.

 using System;
+using System.IO;
 using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.Formats.Webp;
 using SixLabors.ImageSharp.Formats.Webp.Lossy;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using SixLabors.ImageSharp.Tests.TestUtilities.ReferenceCodecs;
 using Xunit;

 namespace SixLabors.ImageSharp.Tests.Formats.Webp
@ -13,6 +17,34 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
    [Trait("Format", "Webp")]
    public class YuvConversionTests
    {
+        private static WebpDecoder WebpDecoder => new();
+
+        private static MagickReferenceDecoder ReferenceDecoder => new();
+
+        private static string TestImageLossyFullPath => Path.Combine(TestEnvironment.InputImagesDirectoryFullPath, TestImages.Webp.Lossy.NoFilter06);
+
+        // Decodes the lossy test image with the Webp decoder and compares it to the original via the reference decoder.
+        public static void RunUpSampleYuvToRgbTest()
+        {
+            var testProvider = TestImageProvider<Rgba32>.File(TestImageLossyFullPath);
+            using Image<Rgba32> decoded = testProvider.GetImage(WebpDecoder);
+
+            decoded.DebugSave(testProvider);
+            decoded.CompareToOriginal(testProvider, ReferenceDecoder);
+        }
+
+        [Fact]
+        public void UpSampleYuvToRgb_Works() => RunUpSampleYuvToRgbTest();
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void UpSampleYuvToRgb_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunUpSampleYuvToRgbTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void UpSampleYuvToRgb_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunUpSampleYuvToRgbTest, HwIntrinsics.DisableSSE2);
+
+#endif
+
        [Theory]
        [WithFile(TestImages.Webp.Yuv, PixelTypes.Rgba32)]
        public void ConvertRgbToYuv_Works<TPixel>(TestImageProvider<TPixel> provider)
--- a/tests/ImageSharp.Tests/Processing/Processors/Convolution/Basic1ParameterConvolutionTests.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Convolution/Basic1ParameterConvolutionTests.cs
@ -17,10 +17,11 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution
        public static readonly TheoryData<int> Values = new TheoryData<int> { 3, 5 };

        public static readonly string[] InputImages =
-            {
-                TestImages.Bmp.Car,
-                TestImages.Png.CalliphoraPartial
-            };
+        {
+            TestImages.Bmp.Car,
+            TestImages.Png.CalliphoraPartial,
+            TestImages.Png.Blur
+        };

        [Theory]
        [WithFileCollection(nameof(InputImages), nameof(Values), PixelTypes.Rgba32)]
--- a/tests/ImageSharp.Tests/TestImages.cs
+++ b/tests/ImageSharp.Tests/TestImages.cs
@ -163,7 +163,7 @@ namespace SixLabors.ImageSharp.Tests
                public const string Fb = "Jpg/progressive/fb.jpg";
                public const string Progress = "Jpg/progressive/progress.jpg";
                public const string Festzug = "Jpg/progressive/Festzug.jpg";
-                public const string Winter = "Jpg/progressive/winter.jpg";
+                public const string Winter420_NonInterleaved = "Jpg/progressive/winter420_noninterleaved.jpg";

                public static class Bad
                {
@ -213,6 +213,7 @@ namespace SixLabors.ImageSharp.Tests
                public const string ArithmeticCoding = "Jpg/baseline/arithmetic_coding.jpg";
                public const string ArithmeticCodingProgressive = "Jpg/progressive/arithmetic_progressive.jpg";
                public const string Lossless = "Jpg/baseline/lossless.jpg";
+                public const string Winter444_Interleaved = "Jpg/baseline/winter444_interleaved.jpg";

                public static readonly string[] All =
                {
--- a/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_3.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_3.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6dd45683953e7cecbbaaa339b78db1303f9583b8d0988fe1948c6b1b4ba297a
+size 121550
--- a/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_5.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/InBox_Rgba32_blur_5.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3867cbbc1d425ceba20dd392de0728ce4de652860491e87434cd33675f56d8e
+size 117863
--- a/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_3.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_3.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:186c35bc159c7125f59b47866021051ff74368b9021dd09ad3c6386b39be3546
+size 80992
--- a/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_5.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/BoxBlurTest/OnFullImage_Rgba32_blur_5.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d7413d1d7ac69feb1d1f0a61d0d4a8228d3276337446d2c761ce58b0813cf66
+size 67243
--- a/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_3.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_3.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ace61fd7330b5e52b7aa09af937259d200b71fa152bf1ffdc6b891e5b61abfd5
+size 117133
--- a/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_5.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/InBox_Rgba32_blur_5.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc2f26bda2dec8354d8b77887806012f28f54b8a8f7e39e7e4bcb4d872d29042
+size 114247
--- a/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_3.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_3.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3aac58316fa795c2683f7cfac34f69ba71501abd78e0d72076cc36c439a8fa7a
+size 63680
--- a/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_5.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianBlurTest/OnFullImage_Rgba32_blur_5.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7bf28351fa51e0e9b0c2fd4b3fc7a30b0b3a8c1ca2dc9dd62ec5fab56e22c10
+size 50451
--- a/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_3.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_3.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6eecdf3bf90a2dd9430ce8501ab98f7a25f4f06674673fd6b9ca6a44435d303
+size 239962
--- a/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_5.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/InBox_Rgba32_blur_5.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cc3a46595d648a4551f499e1246ccdb63a80f424487fb7306fd3cfd772f5f1e
+size 238816
--- a/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_3.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_3.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59ca62ae017d8f5a19dbd0f61ded29d936c325553eb3e08fe39f2440d4c941eb
+size 356290
--- a/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_5.png
+++ b/tests/Images/External/ReferenceOutput/Convolution/GaussianSharpenTest/OnFullImage_Rgba32_blur_5.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:427d325ace605fe9a22702dcd8bff20dff888293def6569c4dc635b56c732565
+size 351992
--- a/tests/Images/External/ReferenceOutput/JpegDecoderTests/DecodeBaselineJpeg_jpeg422.png
+++ b/tests/Images/External/ReferenceOutput/JpegDecoderTests/DecodeBaselineJpeg_jpeg422.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:733cc46271c4402974db2536a55e6ecae3110856df73031ca48dad03745d852d
+size 35375
--- a/tests/Images/Input/Jpg/baseline/winter444_interleaved.jpg
+++ b/tests/Images/Input/Jpg/baseline/winter444_interleaved.jpg
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73b1deb4e2fb8027f6bb4fb293e5b2615c80b3ac0a7f99fd90118fd340a9fd12
+size 283330
--- a/tests/Images/Input/Jpg/progressive/winter420_noninterleaved.jpg
+++ b/tests/Images/Input/Jpg/progressive/winter420_noninterleaved.jpg