clean up and organize Jpeg test utilities

9 years ago · f6b5b420c7
23 changed files with 1637 additions and 1631 deletions
--- a/src/ImageSharp/Formats/Jpeg/GolangPort/Utils/MutableSpanExtensions.cs
+++ b/src/ImageSharp/Formats/Jpeg/GolangPort/Utils/MutableSpanExtensions.cs
@ -9,9 +9,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Utils
    using System;

    /// <summary>
-    /// MutableSpan Extensions
+    /// Span Extensions
    /// </summary>
-    internal static class MutableSpanExtensions
+    internal static class SpanExtensions
    {
        /// <summary>
        /// Save to a Vector4
@ -90,7 +90,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Utils
        /// </summary>
        /// <param name="src">Source</param>
        /// <returns>A new <see cref="Span{T}"/> with float values</returns>
-        public static Span<int> ConvertToInt32MutableSpan(this Span<float> src)
+        public static Span<int> ConvertToInt32Span(this Span<float> src)
        {
            int[] result = new int[src.Length];
            for (int i = 0; i < src.Length; i++)
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@ -1,24 +1,23 @@
 // Copyright (c) Six Labors and contributors.
 // Licensed under the Apache License, Version 2.0.

-using System.Diagnostics;

-using SixLabors.ImageSharp.Formats;
-using SixLabors.ImageSharp.Formats.Jpeg.Common;
-using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Components;
-using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Utils;
-
-using Xunit;
-using Xunit.Abstractions;

 // Uncomment this to turn unit tests into benchmarks:
 //#define BENCHMARKING

 // ReSharper disable InconsistentNaming

-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
    using System;
+    using System.Diagnostics;
+
+    using SixLabors.ImageSharp.Formats.Jpeg.Common;
+    using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
+
+    using Xunit;
+    using Xunit.Abstractions;

    public class Block8x8FTests : JpegUtilityTestFixture
    {
@ -222,88 +221,7 @@ namespace SixLabors.ImageSharp.Tests
            sw.Stop();
            this.Output.WriteLine($"TranposeInto_PinningImpl_Benchmark finished in {sw.ElapsedMilliseconds} ms");
        }
-
-        [Fact]
-        public void iDCT2D8x4_LeftPart()
-        {
-            float[] sourceArray = Create8x8FloatData();
-            float[] expectedDestArray = new float[64];
-
-            ReferenceImplementations.iDCT2D8x4_32f(sourceArray, expectedDestArray);
-
-            Block8x8F source = new Block8x8F();
-            source.LoadFrom(sourceArray);
-
-            Block8x8F dest = new Block8x8F();
-
-            DCT.IDCT8x4_LeftPart(ref source, ref dest);
-
-            float[] actualDestArray = new float[64];
-            dest.CopyTo(actualDestArray);
-
-            this.Print8x8Data(expectedDestArray);
-            this.Output.WriteLine("**************");
-            this.Print8x8Data(actualDestArray);
-
-            Assert.Equal(expectedDestArray, actualDestArray);
-        }
-
-        [Fact]
-        public void iDCT2D8x4_RightPart()
-        {
-            float[] sourceArray = Create8x8FloatData();
-            float[] expectedDestArray = new float[64];
-
-            ReferenceImplementations.iDCT2D8x4_32f(sourceArray.AsSpan().Slice(4), expectedDestArray.AsSpan().Slice(4));
-
-            Block8x8F source = new Block8x8F();
-            source.LoadFrom(sourceArray);
-
-            Block8x8F dest = new Block8x8F();
-
-            DCT.IDCT8x4_RightPart(ref source, ref dest);
-
-            float[] actualDestArray = new float[64];
-            dest.CopyTo(actualDestArray);
-
-            this.Print8x8Data(expectedDestArray);
-            this.Output.WriteLine("**************");
-            this.Print8x8Data(actualDestArray);
-
-            Assert.Equal(expectedDestArray, actualDestArray);
-        }
-
-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        [InlineData(3)]
-        public void TransformIDCT(int seed)
-        {
-            Span<float> sourceArray = Create8x8RandomFloatData(-200, 200, seed);
-            float[] expectedDestArray = new float[64];
-            float[] tempArray = new float[64];
-
-            ReferenceImplementations.iDCT2D_llm(sourceArray, expectedDestArray, tempArray);
-
-            // ReferenceImplementations.iDCT8x8_llm_sse(sourceArray, expectedDestArray, tempArray);
-            Block8x8F source = new Block8x8F();
-            source.LoadFrom(sourceArray);
-
-            Block8x8F dest = new Block8x8F();
-            Block8x8F tempBuffer = new Block8x8F();
-
-            DCT.TransformIDCT(ref source, ref dest, ref tempBuffer);
-
-            float[] actualDestArray = new float[64];
-            dest.CopyTo(actualDestArray);
-
-            this.Print8x8Data(expectedDestArray);
-            this.Output.WriteLine("**************");
-            this.Print8x8Data(actualDestArray);
-            Assert.Equal(expectedDestArray, actualDestArray, new ApproximateFloatComparer(1f));
-            Assert.Equal(expectedDestArray, actualDestArray, new ApproximateFloatComparer(1f));
-        }
-
+        
        [Fact]
        public unsafe void CopyColorsTo()
        {
@ -367,74 +285,6 @@ namespace SixLabors.ImageSharp.Tests
            }
        }

-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        public void FDCT8x4_LeftPart(int seed)
-        {
-            Span<float> src = Create8x8RandomFloatData(-200, 200, seed);
-            Block8x8F srcBlock = new Block8x8F();
-            srcBlock.LoadFrom(src);
-
-            Block8x8F destBlock = new Block8x8F();
-
-            float[] expectedDest = new float[64];
-
-            ReferenceImplementations.fDCT2D8x4_32f(src, expectedDest);
-            DCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock);
-
-            float[] actualDest = new float[64];
-            destBlock.CopyTo(actualDest);
-
-            Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-        }
-
-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        public void FDCT8x4_RightPart(int seed)
-        {
-            Span<float> src = Create8x8RandomFloatData(-200, 200, seed);
-            Block8x8F srcBlock = new Block8x8F();
-            srcBlock.LoadFrom(src);
-
-            Block8x8F destBlock = new Block8x8F();
-
-            float[] expectedDest = new float[64];
-
-            ReferenceImplementations.fDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan().Slice(4));
-            DCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock);
-
-            float[] actualDest = new float[64];
-            destBlock.CopyTo(actualDest);
-
-            Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-        }
-
-        [Theory]
-        [InlineData(1)]
-        [InlineData(2)]
-        public void TransformFDCT(int seed)
-        {
-            Span<float> src = Create8x8RandomFloatData(-200, 200, seed);
-            Block8x8F srcBlock = new Block8x8F();
-            srcBlock.LoadFrom(src);
-
-            Block8x8F destBlock = new Block8x8F();
-
-            float[] expectedDest = new float[64];
-            float[] temp1 = new float[64];
-            Block8x8F temp2 = new Block8x8F();
-
-            ReferenceImplementations.fDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
-            DCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false);
-
-            float[] actualDest = new float[64];
-            destBlock.CopyTo(actualDest);
-
-            Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-        }
-
        [Theory]
        [InlineData(1)]
        [InlineData(2)]
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
@ -1,9 +1,8 @@
 // ReSharper disable InconsistentNaming
-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
-    using Moq;
-
    using SixLabors.ImageSharp.Formats.Jpeg.Common;
+    using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;

    using Xunit;
    using Xunit.Abstractions;
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@ -0,0 +1,175 @@
+// ReSharper disable InconsistentNaming
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
+{
+    using System;
+
+    using SixLabors.ImageSharp.Formats.Jpeg.Common;
+    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Components;
+    using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
+
+    using Xunit;
+    using Xunit.Abstractions;
+
+    public static class DCTTests
+    {
+        public class FastFloatingPoint : JpegUtilityTestFixture
+        {
+            /// <summary>
+            /// Initializes a new instance of the <see cref="FastFloatingPoint"/> class.
+            /// </summary>
+            /// <param name="output">The xUnit output helper, forwarded unchanged to the base fixture constructor.</param>
+            public FastFloatingPoint(ITestOutputHelper output)
+                : base(output)
+            {
+            }
+
+
+            /// <summary>
+            /// Checks <c>DCT.IDCT8x4_LeftPart</c> against
+            /// <c>ReferenceImplementations.FastFloatingPointDCT.iDCT2D8x4_32f</c>
+            /// applied to the same 8x8 input; the two results must be element-wise equal.
+            /// </summary>
+            [Fact]
+            public void iDCT2D8x4_LeftPart()
+            {
+                float[] input = Create8x8FloatData();
+                var expected = new float[64];
+
+                // Reference result, computed directly on the flat arrays.
+                ReferenceImplementations.FastFloatingPointDCT.iDCT2D8x4_32f(input, expected);
+
+                var sourceBlock = new Block8x8F();
+                sourceBlock.LoadFrom(input);
+
+                var destBlock = new Block8x8F();
+
+                // Implementation under test, working on block structures.
+                DCT.IDCT8x4_LeftPart(ref sourceBlock, ref destBlock);
+
+                var actual = new float[64];
+                destBlock.CopyTo(actual);
+
+                // Dump both results so a mismatch can be inspected in the test log.
+                this.Print8x8Data(expected);
+                this.Output.WriteLine("**************");
+                this.Print8x8Data(actual);
+
+                Assert.Equal(expected, actual);
+            }
+
+            /// <summary>
+            /// Checks <c>DCT.IDCT8x4_RightPart</c> against
+            /// <c>ReferenceImplementations.FastFloatingPointDCT.iDCT2D8x4_32f</c>
+            /// applied to the source and destination buffers sliced at offset 4;
+            /// the two results must be element-wise equal.
+            /// </summary>
+            [Fact]
+            public void iDCT2D8x4_RightPart()
+            {
+                float[] input = Create8x8FloatData();
+                var expected = new float[64];
+
+                // The reference routine is pointed at the spans starting 4 elements in.
+                Span<float> inputFromColumn4 = input.AsSpan().Slice(4);
+                Span<float> expectedFromColumn4 = expected.AsSpan().Slice(4);
+                ReferenceImplementations.FastFloatingPointDCT.iDCT2D8x4_32f(inputFromColumn4, expectedFromColumn4);
+
+                var sourceBlock = new Block8x8F();
+                sourceBlock.LoadFrom(input);
+
+                var destBlock = new Block8x8F();
+
+                // Implementation under test, working on block structures.
+                DCT.IDCT8x4_RightPart(ref sourceBlock, ref destBlock);
+
+                var actual = new float[64];
+                destBlock.CopyTo(actual);
+
+                // Dump both results so a mismatch can be inspected in the test log.
+                this.Print8x8Data(expected);
+                this.Output.WriteLine("**************");
+                this.Print8x8Data(actual);
+
+                Assert.Equal(expected, actual);
+            }
+
+            /// <summary>
+            /// Checks <c>DCT.TransformIDCT</c> against
+            /// <c>ReferenceImplementations.FastFloatingPointDCT.iDCT2D_llm</c> on seeded random input,
+            /// comparing the results through <c>ApproximateFloatComparer(1f)</c>.
+            /// </summary>
+            /// <param name="seed">Seed passed to the random 8x8 data generator.</param>
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            [InlineData(3)]
+            public void TransformIDCT(int seed)
+            {
+                Span<float> sourceArray = JpegUtilityTestFixture.Create8x8RandomFloatData(-200, 200, seed);
+                float[] expectedDestArray = new float[64];
+                float[] tempArray = new float[64];
+
+                ReferenceImplementations.FastFloatingPointDCT.iDCT2D_llm(sourceArray, expectedDestArray, tempArray);
+
+                // ReferenceImplementations.iDCT8x8_llm_sse(sourceArray, expectedDestArray, tempArray);
+                Block8x8F source = new Block8x8F();
+                source.LoadFrom(sourceArray);
+
+                Block8x8F dest = new Block8x8F();
+                Block8x8F tempBuffer = new Block8x8F();
+
+                DCT.TransformIDCT(ref source, ref dest, ref tempBuffer);
+
+                float[] actualDestArray = new float[64];
+                dest.CopyTo(actualDestArray);
+
+                // Dump both results so a mismatch can be inspected in the test log.
+                this.Print8x8Data(expectedDestArray);
+                this.Output.WriteLine("**************");
+                this.Print8x8Data(actualDestArray);
+
+                // A single comparison is sufficient; the identical assertion used to be stated twice.
+                Assert.Equal(expectedDestArray, actualDestArray, new ApproximateFloatComparer(1f));
+            }
+
+
+            /// <summary>
+            /// Checks <c>DCT.FDCT8x4_LeftPart</c> against
+            /// <c>ReferenceImplementations.FastFloatingPointDCT.fDCT2D8x4_32f</c> on seeded random input,
+            /// comparing the results through <c>ApproximateFloatComparer(1f)</c>.
+            /// </summary>
+            /// <param name="seed">Seed passed to the random 8x8 data generator.</param>
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            public void FDCT8x4_LeftPart(int seed)
+            {
+                Span<float> src = JpegUtilityTestFixture.Create8x8RandomFloatData(-200, 200, seed);
+                Block8x8F srcBlock = new Block8x8F();
+                srcBlock.LoadFrom(src);
+
+                Block8x8F destBlock = new Block8x8F();
+
+                float[] expectedDest = new float[64];
+
+                ReferenceImplementations.FastFloatingPointDCT.fDCT2D8x4_32f(src, expectedDest);
+                DCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock);
+
+                float[] actualDest = new float[64];
+                destBlock.CopyTo(actualDest);
+
+                // xUnit's convention is (expected, actual); keeping that order makes failure output label the values correctly.
+                Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
+            }
+
+            /// <summary>
+            /// Checks <c>DCT.FDCT8x4_RightPart</c> against
+            /// <c>ReferenceImplementations.FastFloatingPointDCT.fDCT2D8x4_32f</c> applied to the source
+            /// and destination buffers sliced at offset 4, comparing the results through
+            /// <c>ApproximateFloatComparer(1f)</c>.
+            /// </summary>
+            /// <param name="seed">Seed passed to the random 8x8 data generator.</param>
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            public void FDCT8x4_RightPart(int seed)
+            {
+                Span<float> src = JpegUtilityTestFixture.Create8x8RandomFloatData(-200, 200, seed);
+                Block8x8F srcBlock = new Block8x8F();
+                srcBlock.LoadFrom(src);
+
+                Block8x8F destBlock = new Block8x8F();
+
+                float[] expectedDest = new float[64];
+
+                // The reference routine is pointed at the spans starting 4 elements in.
+                ReferenceImplementations.FastFloatingPointDCT.fDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan().Slice(4));
+                DCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock);
+
+                float[] actualDest = new float[64];
+                destBlock.CopyTo(actualDest);
+
+                // xUnit's convention is (expected, actual); keeping that order makes failure output label the values correctly.
+                Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
+            }
+
+            /// <summary>
+            /// Checks <c>DCT.TransformFDCT</c> against
+            /// <c>ReferenceImplementations.FastFloatingPointDCT.fDCT2D_llm</c> (called with
+            /// <c>downscaleBy8: true</c>) on seeded random input, comparing the results through
+            /// <c>ApproximateFloatComparer(1f)</c>.
+            /// </summary>
+            /// <param name="seed">Seed passed to the random 8x8 data generator.</param>
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            public void TransformFDCT(int seed)
+            {
+                Span<float> src = JpegUtilityTestFixture.Create8x8RandomFloatData(-200, 200, seed);
+                Block8x8F srcBlock = new Block8x8F();
+                srcBlock.LoadFrom(src);
+
+                Block8x8F destBlock = new Block8x8F();
+
+                float[] expectedDest = new float[64];
+                float[] temp1 = new float[64];
+                Block8x8F temp2 = new Block8x8F();
+
+                ReferenceImplementations.FastFloatingPointDCT.fDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
+                DCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false);
+
+                float[] actualDest = new float[64];
+                destBlock.CopyTo(actualDest);
+
+                // xUnit's convention is (expected, actual); keeping that order makes failure output label the values correctly.
+                Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
+            }
+
+        }
+    }
+}
--- a/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/JpegDecoderTests.cs
@ -1,16 +1,11 @@
 // Copyright (c) Six Labors and contributors.
 // Licensed under the Apache License, Version 2.0.

-using System;
-using System.IO;
-using SixLabors.ImageSharp.Formats;
-using SixLabors.ImageSharp.PixelFormats;
-using Xunit;
+

 // ReSharper disable InconsistentNaming
-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
-    using System;
    using System.IO;
    using System.Linq;

@ -19,8 +14,8 @@ namespace SixLabors.ImageSharp.Tests
    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort;
    using SixLabors.ImageSharp.Formats.Jpeg.PdfJsPort;
    using SixLabors.ImageSharp.PixelFormats;
+    using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
    using SixLabors.ImageSharp.Tests.TestUtilities.ImageComparison;
-    using SixLabors.ImageSharp.Tests.TestUtilities.ReferenceCodecs;

    using Xunit;
    using Xunit.Abstractions;
--- a/tests/ImageSharp.Tests/Formats/Jpg/JpegEncoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/JpegEncoderTests.cs
@ -1,22 +1,23 @@
 // Copyright (c) Six Labors and contributors.
 // Licensed under the Apache License, Version 2.0.

-using System.Collections.Generic;
-using System.IO;

-using SixLabors.ImageSharp.Formats;
-using SixLabors.ImageSharp.Formats.Jpeg;
-using SixLabors.ImageSharp.PixelFormats;
-using SixLabors.ImageSharp.Processing;
-using SixLabors.Primitives;
-
-using Xunit;
-using Xunit.Abstractions;

 // ReSharper disable InconsistentNaming

-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
+    using System.Collections.Generic;
+    using System.IO;
+
+    using SixLabors.ImageSharp.Formats.Jpeg;
+    using SixLabors.ImageSharp.PixelFormats;
+    using SixLabors.ImageSharp.Processing;
+    using SixLabors.Primitives;
+
+    using Xunit;
+    using Xunit.Abstractions;
+
    public class JpegEncoderTests : MeasureFixture
    {
        public static IEnumerable<string> AllBmpFiles => TestImages.Bmp.All;
--- a/tests/ImageSharp.Tests/Formats/Jpg/JpegProfilingBenchmarks.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/JpegProfilingBenchmarks.cs
@ -1,18 +1,18 @@
 // Copyright (c) Six Labors and contributors.
 // Licensed under the Apache License, Version 2.0.

-using System;
-using System.IO;
-using System.Linq;
-using System.Numerics;
-using SixLabors.ImageSharp.Formats;
-using SixLabors.ImageSharp.Formats.Jpeg;
-using SixLabors.ImageSharp.PixelFormats;
-using Xunit;
-using Xunit.Abstractions;
-
-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
+    using System;
+    using System.IO;
+    using System.Linq;
+    using System.Numerics;
+
+    using SixLabors.ImageSharp.Formats.Jpeg;
+
+    using Xunit;
+    using Xunit.Abstractions;
+
    public class JpegProfilingBenchmarks : MeasureFixture
    {
        public JpegProfilingBenchmarks(ITestOutputHelper output)
--- a/tests/ImageSharp.Tests/Formats/Jpg/JpegUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/JpegUtilsTests.cs
@ -1,16 +1,18 @@
 // Copyright (c) Six Labors and contributors.
 // Licensed under the Apache License, Version 2.0.

-using System.Numerics;

-using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Utils;
-using SixLabors.ImageSharp.PixelFormats;
-
-using Xunit;

 // ReSharper disable InconsistentNaming
-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
+    using System.Numerics;
+
+    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Utils;
+    using SixLabors.ImageSharp.PixelFormats;
+
+    using Xunit;
+
    public class JpegUtilsTests
    {
        public static Image<TPixel> CreateTestImage<TPixel>()
--- a/tests/ImageSharp.Tests/Formats/Jpg/LibJpegTools.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/LibJpegTools.cs
@ -1,483 +0,0 @@
-namespace SixLabors.ImageSharp.Tests
-{
-    using System;
-    using System.Collections.Generic;
-    using System.Diagnostics;
-    using System.IO;
-    using System.Linq;
-    using System.Numerics;
-    using System.Reflection;
-    
-    using SixLabors.ImageSharp.Formats.Jpeg;
-    using SixLabors.ImageSharp.Formats.Jpeg.Common;
-    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort;
-    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Components.Decoder;
-    using SixLabors.ImageSharp.Formats.Jpeg.PdfJsPort;
-    using SixLabors.ImageSharp.Formats.Jpeg.PdfJsPort.Components;
-    using SixLabors.ImageSharp.Memory;
-    using SixLabors.ImageSharp.PixelFormats;
-    using SixLabors.Primitives;
-
-    using Xunit;
-
-    internal static class LibJpegTools
-    {
-        public class SpectralData : IEquatable<SpectralData>
-        {
-            public int ComponentCount { get; private set; }
-
-            public ComponentData[] Components { get; private set; }
-
-            private SpectralData(Array wholeImage)
-            {
-                this.ComponentCount = 0;
-
-                for (int i = 0; i < wholeImage.Length && wholeImage.GetValue(i) != null; i++)
-                {
-                    this.ComponentCount++;
-                }
-
-                this.Components = new ComponentData[this.ComponentCount];
-
-                for (int i = 0; i < this.ComponentCount; i++)
-                {
-                    object jVirtArray = wholeImage.GetValue(i);
-                    Array bloxSource = (Array)GetNonPublicMember(jVirtArray, "m_buffer");
-
-                    this.Components[i] = ComponentData.Load(bloxSource, i);
-                }
-            }
-
-            internal SpectralData(ComponentData[] components)
-            {
-                this.ComponentCount = components.Length;
-                this.Components = components;
-            }
-            
-            public static SpectralData LoadFromImageSharpDecoder(JpegDecoderCore decoder)
-            {
-                FrameComponent[] srcComponents = decoder.Frame.Components;
-                ComponentData[] destComponents = srcComponents.Select(ComponentData.Load).ToArray();
-
-                return new SpectralData(destComponents);
-            }
-
-            public static SpectralData LoadFromImageSharpDecoder(OldJpegDecoderCore decoder)
-            {
-                OldComponent[] srcComponents = decoder.Components;
-                ComponentData[] destComponents = srcComponents.Select(ComponentData.Load).ToArray();
-
-                return new SpectralData(destComponents);
-            }
-
-            public Image<Rgba32> TryCreateRGBSpectralImage()
-            {
-                if (this.ComponentCount != 3) return null;
-
-                ComponentData c0 = this.Components[0];
-                ComponentData c1 = this.Components[1];
-                ComponentData c2 = this.Components[2];
-
-                if (c0.Size != c1.Size || c1.Size != c2.Size)
-                {
-                    return null;
-                }
-
-                Image<Rgba32> result = new Image<Rgba32>(c0.WidthInBlocks * 8, c0.HeightInBlocks * 8);
-
-                for (int by = 0; by < c0.HeightInBlocks; by++)
-                {
-                    for (int bx = 0; bx < c0.WidthInBlocks; bx++)
-                    {
-                        this.WriteToImage(bx, by, result);
-                    }
-                }
-                return result;
-            }
-
-            internal void WriteToImage(int bx, int by, Image<Rgba32> image)
-            {
-                ComponentData c0 = this.Components[0];
-                ComponentData c1 = this.Components[1];
-                ComponentData c2 = this.Components[2];
-
-                Block8x8 block0 = c0.Blocks[bx, by];
-                Block8x8 block1 = c1.Blocks[bx, by];
-                Block8x8 block2 = c2.Blocks[bx, by];
-
-                float d0 = (c0.MaxVal - c0.MinVal);
-                float d1 = (c1.MaxVal - c1.MinVal);
-                float d2 = (c2.MaxVal - c2.MinVal);
-
-                for (int y = 0; y < 8; y++)
-                {
-                    for (int x = 0; x < 8; x++)
-                    {
-                        float val0 = c0.GetBlockValue(block0, x, y);
-                        float val1 = c0.GetBlockValue(block1, x, y);
-                        float val2 = c0.GetBlockValue(block2, x, y);
-
-                        Vector4 v = new Vector4(val0, val1, val2, 1);
-                        Rgba32 color = default(Rgba32);
-                        color.PackFromVector4(v);
-
-                        int yy = by * 8 + y;
-                        int xx = bx * 8 + x;
-                        image[xx, yy] = color;
-                    }
-                }
-            }
-
-            public bool Equals(SpectralData other)
-            {
-                if (ReferenceEquals(null, other)) return false;
-                if (ReferenceEquals(this, other)) return true;
-                if (this.ComponentCount != other.ComponentCount)
-                {
-                    return false;
-                }
-
-                for (int i = 0; i < this.ComponentCount; i++)
-                {
-                    ComponentData a = this.Components[i];
-                    ComponentData b = other.Components[i];
-                    if (!a.Equals(b)) return false;
-                }
-                return true;
-            }
-
-            public override bool Equals(object obj)
-            {
-                if (ReferenceEquals(null, obj)) return false;
-                if (ReferenceEquals(this, obj)) return true;
-                if (obj.GetType() != this.GetType()) return false;
-                return Equals((SpectralData)obj);
-            }
-
-            public override int GetHashCode()
-            {
-                unchecked
-                {
-                    return (this.ComponentCount * 397) ^ (this.Components != null ? this.Components[0].GetHashCode() : 0);
-                }
-            }
-
-            public static bool operator ==(SpectralData left, SpectralData right)
-            {
-                return Equals(left, right);
-            }
-
-            public static bool operator !=(SpectralData left, SpectralData right)
-            {
-                return !Equals(left, right);
-            }
-        }
-
-        public class ComponentData : IEquatable<ComponentData>, IJpegComponent
-        {
-            public ComponentData(int heightInBlocks, int widthInBlocks, int index)
-            {
-                this.HeightInBlocks = heightInBlocks;
-                this.WidthInBlocks = widthInBlocks;
-                this.Index = index;
-                this.Blocks = new Buffer2D<Block8x8>(this.WidthInBlocks, this.HeightInBlocks);
-            }
-
-            public Size Size => new Size(this.WidthInBlocks, this.HeightInBlocks);
-
-            public int Index { get; }
-
-            public int HeightInBlocks { get; }
-
-            public int WidthInBlocks { get; }
-
-            public Buffer2D<Block8x8> Blocks { get; private set; }
-
-            public short MinVal { get; private set; } = short.MaxValue;
-
-            public short MaxVal { get; private set; } = short.MinValue;
-            
-            public static ComponentData Load(Array bloxSource, int index)
-            {
-                int yCount = bloxSource.Length;
-                Array row0 = (Array)bloxSource.GetValue(0);
-                int xCount = row0.Length;
-                ComponentData result = new ComponentData(yCount, xCount, index);
-                result.Init(bloxSource);
-                return result;
-            }
-
-            private void Init(Array bloxSource)
-            {
-                for (int y = 0; y < bloxSource.Length; y++)
-                {
-                    Array row = (Array)bloxSource.GetValue(y);
-                    for (int x = 0; x < row.Length; x++)
-                    {
-                        object jBlock = row.GetValue(x);
-                        short[] data = (short[])GetNonPublicMember(jBlock, "data");
-                        this.MakeBlock(data, y, x);
-                    }
-                }
-            }
-
-            internal void MakeBlock(short[] data, int y, int x)
-            {
-                this.MinVal = Math.Min(this.MinVal, data.Min());
-                this.MaxVal = Math.Max(this.MaxVal, data.Max());
-                this.Blocks[x, y] = new Block8x8(data);
-            }
-
-            public static ComponentData Load(FrameComponent c, int index)
-            {
-                var result = new ComponentData(
-                    c.BlocksPerColumnForMcu,
-                    c.BlocksPerLineForMcu,
-                    index
-                    );
-
-                for (int y = 0; y < result.HeightInBlocks; y++)
-                {
-                    for (int x = 0; x < result.WidthInBlocks; x++)
-                    {
-                        short[] data = c.GetBlockBuffer(y, x).ToArray();
-                        result.MakeBlock(data, y, x);
-                    }
-                }
-
-                return result;
-            }
-
-            public static ComponentData Load(OldComponent c)
-            {
-                var result = new ComponentData(
-                    c.HeightInBlocks,
-                    c.WidthInBlocks,
-                    c.Index
-                );
-
-                for (int y = 0; y < result.HeightInBlocks; y++)
-                {
-                    for (int x = 0; x < result.WidthInBlocks; x++)
-                    {
-                        short[] data = c.GetBlockReference(x, y).ToArray();
-                        result.MakeBlock(data, y, x);
-                    }
-                }
-
-                return result;
-            }
-
-            public Image<Rgba32> CreateGrayScaleImage()
-            {
-                Image<Rgba32> result = new Image<Rgba32>(this.WidthInBlocks * 8, this.HeightInBlocks * 8);
-                
-                for (int by = 0; by < this.HeightInBlocks; by++)
-                {
-                    for (int bx = 0; bx < this.WidthInBlocks; bx++)
-                    {
-                        this.WriteToImage(bx, by, result);
-                    }
-                }
-                return result;
-            }
-
-            internal void WriteToImage(int bx, int by, Image<Rgba32> image)
-            {
-                Block8x8 block = this.Blocks[bx, by];
-                
-                for (int y = 0; y < 8; y++)
-                {
-                    for (int x = 0; x < 8; x++)
-                    {
-                        var val = this.GetBlockValue(block, x, y);
-
-                        Vector4 v = new Vector4(val, val, val, 1);
-                        Rgba32 color = default(Rgba32);
-                        color.PackFromVector4(v);
-
-                        int yy = by * 8 + y;
-                        int xx = bx * 8 + x;
-                        image[xx, yy] = color;
-                    }
-                }
-            }
-
-            internal float GetBlockValue(Block8x8 block, int x, int y)
-            {
-                float d = (this.MaxVal - this.MinVal);
-                float val = block.GetValueAt(x, y);
-                val -= this.MinVal;
-                val /= d;
-                return val;
-            }
-
-            public bool Equals(ComponentData other)
-            {
-                if (ReferenceEquals(null, other)) return false;
-                if (ReferenceEquals(this, other)) return true;
-                bool ok = this.Index == other.Index && this.HeightInBlocks == other.HeightInBlocks
-                          && this.WidthInBlocks == other.WidthInBlocks;
-                       //&& this.MinVal == other.MinVal
-                       //&& this.MaxVal == other.MaxVal;
-                if (!ok) return false;
-
-                for (int y = 0; y < this.HeightInBlocks; y++)
-                {
-                    for (int x = 0; x < this.WidthInBlocks; x++)
-                    {
-                        Block8x8 a = this.Blocks[x, y];
-                        Block8x8 b = other.Blocks[x, y];
-                        if (!a.Equals(b)) return false;
-                    }
-                }
-                return true;
-            }
-
-            public override bool Equals(object obj)
-            {
-                if (ReferenceEquals(null, obj)) return false;
-                if (ReferenceEquals(this, obj)) return true;
-                if (obj.GetType() != this.GetType()) return false;
-                return Equals((ComponentData)obj);
-            }
-
-            /// <summary>
-            /// Hash code built only from the members that Equals(ComponentData) compares
-            /// (Index, HeightInBlocks, WidthInBlocks). MinVal and MaxVal are left out because
-            /// Equals ignores them; mixing them in would let two equal instances produce
-            /// different hash codes.
-            /// </summary>
-            /// <returns>The hash code.</returns>
-            public override int GetHashCode()
-            {
-                unchecked
-                {
-                    int hashCode = this.Index;
-                    hashCode = (hashCode * 397) ^ this.HeightInBlocks;
-                    hashCode = (hashCode * 397) ^ this.WidthInBlocks;
-                    return hashCode;
-                }
-            }
-
-            /// <summary>
-            /// Equality operator; defers to the static Equals(object, object).
-            /// </summary>
-            public static bool operator ==(ComponentData left, ComponentData right) => Equals(left, right);
-
-            /// <summary>
-            /// Inequality operator; the negation of the static Equals(object, object).
-            /// </summary>
-            public static bool operator !=(ComponentData left, ComponentData right) => !Equals(left, right);
-
-            
-        }
-
-        /// <summary>
-        /// Looks up a non-public instance field on the runtime type of <paramref name="obj"/>.
-        /// </summary>
-        /// <param name="obj">The object whose type is inspected.</param>
-        /// <param name="fieldName">The name of the field.</param>
-        /// <returns>The field, or null when the type has no such field.</returns>
-        internal static FieldInfo GetNonPublicField(object obj, string fieldName)
-        {
-            const BindingFlags Flags = BindingFlags.Instance | BindingFlags.NonPublic;
-            return obj.GetType().GetField(fieldName, Flags);
-        }
-
-        /// <summary>
-        /// Reads the value of a non-public instance field of <paramref name="obj"/>.
-        /// </summary>
-        /// <param name="obj">The object to read from.</param>
-        /// <param name="fieldName">The name of the field.</param>
-        /// <returns>The current value of the field.</returns>
-        internal static object GetNonPublicMember(object obj, string fieldName)
-        {
-            return GetNonPublicField(obj, fieldName).GetValue(obj);
-        }
-
-        /// <summary>
-        /// Sums Block8x8.TotalDifference over every block position of <paramref name="expected"/>
-        /// and reports the sum together with the sum divided by the number of compared values.
-        /// </summary>
-        /// <param name="expected">Defines the compared area (its width and height in blocks).</param>
-        /// <param name="actual">Must be at least as large as <paramref name="expected"/> in both dimensions.</param>
-        /// <returns>The total difference and the average difference per value.</returns>
-        public static (double total, double average) CalculateDifference(ComponentData expected, ComponentData actual)
-        {
-            if (actual.WidthInBlocks < expected.WidthInBlocks)
-            {
-                throw new Exception("actual.WidthInBlocks < expected.WidthInBlocks");
-            }
-
-            if (actual.HeightInBlocks < expected.HeightInBlocks)
-            {
-                throw new Exception("actual.HeightInBlocks < expected.HeightInBlocks");
-            }
-
-            int columns = expected.WidthInBlocks;
-            int rows = expected.HeightInBlocks;
-
-            // BigInteger accumulator, so the running sum cannot overflow.
-            BigInteger sum = 0;
-            for (int row = 0; row < rows; row++)
-            {
-                for (int column = 0; column < columns; column++)
-                {
-                    Block8x8 expectedBlock = expected.Blocks[column, row];
-                    Block8x8 actualBlock = actual.Blocks[column, row];
-                    sum += Block8x8.TotalDifference(ref expectedBlock, ref actualBlock);
-                }
-            }
-
-            int blockCount = columns * rows;
-            double total = (double)sum;
-            return (total, total / (blockCount * Block8x8.Size));
-        }
-
-        /// <summary>
-        /// Gets the path of dump-jpeg-coeffs.exe below the test tools directory.
-        /// </summary>
-        private static string DumpToolFullPath
-        {
-            get
-            {
-                string toolsDirectory = TestEnvironment.ToolsDirectoryFullPath;
-                return Path.Combine(toolsDirectory, @"jpeg\dump-jpeg-coeffs.exe");
-            }
-        }
-
-        /// <summary>
-        /// Starts the external dump tool (see DumpToolFullPath) and blocks until it exits.
-        /// </summary>
-        /// <param name="sourceFile">Passed, quoted, as the first command line argument.</param>
-        /// <param name="destFile">Passed, quoted, as the second command line argument.</param>
-        public static void RunDumpJpegCoeffsTool(string sourceFile, string destFile)
-        {
-            string args = $@"""{sourceFile}"" ""{destFile}""";
-
-            // Process is IDisposable; release it once the tool has finished.
-            using (Process process = Process.Start(DumpToolFullPath, args))
-            {
-                process.WaitForExit();
-            }
-        }
-
-        /// <summary>
-        /// Runs the external dump tool on <paramref name="inputFile"/> and parses the file it writes.
-        /// As read below, the dump consists of: an Int16 component count; for each component an
-        /// Int16 width and an Int16 height (in blocks); then, component by component and row by
-        /// row, 64 Int16 values per block.
-        /// </summary>
-        /// <param name="inputFile">The test file name handed to TestFile.Create.</param>
-        /// <returns>The spectral data of all components.</returns>
-        /// <exception cref="EndOfStreamException">The dump ends in the middle of a block.</exception>
-        public static SpectralData ExtractSpectralData(string inputFile)
-        {
-            TestFile testFile = TestFile.Create(inputFile);
-
-            string outDir = TestEnvironment.CreateOutputDirectory(".Temp", "JpegCoeffs");
-
-            // A GUID keeps concurrently running tests from picking the same temporary file name
-            // (a random number below 1000 can easily collide).
-            string fn = $"{Path.GetFileName(inputFile)}-{Guid.NewGuid():N}.dctcoeffs";
-            string coeffFileFullPath = Path.Combine(outDir, fn);
-
-            try
-            {
-                RunDumpJpegCoeffsTool(testFile.FullPath, coeffFileFullPath);
-
-                using (var dumpStream = new FileStream(coeffFileFullPath, FileMode.Open))
-                using (var rdr = new BinaryReader(dumpStream))
-                {
-                    int componentCount = rdr.ReadInt16();
-                    ComponentData[] result = new ComponentData[componentCount];
-
-                    for (int i = 0; i < componentCount; i++)
-                    {
-                        int widthInBlocks = rdr.ReadInt16();
-                        int heightInBlocks = rdr.ReadInt16();
-                        result[i] = new ComponentData(heightInBlocks, widthInBlocks, i);
-                    }
-
-                    const int BlockByteCount = 64 * sizeof(short);
-
-                    for (int i = 0; i < result.Length; i++)
-                    {
-                        ComponentData c = result[i];
-
-                        for (int y = 0; y < c.HeightInBlocks; y++)
-                        {
-                            for (int x = 0; x < c.WidthInBlocks; x++)
-                            {
-                                // ReadBytes returns fewer bytes only at the end of the stream, so a
-                                // short result means the dump is truncated. Fail instead of parsing
-                                // a partially filled buffer.
-                                byte[] buffer = rdr.ReadBytes(BlockByteCount);
-                                if (buffer.Length != BlockByteCount)
-                                {
-                                    throw new EndOfStreamException(
-                                        $"Unexpected end of coefficient dump: expected {BlockByteCount} bytes for a block, got {buffer.Length}.");
-                                }
-
-                                short[] block = buffer.AsSpan().NonPortableCast<byte, short>().ToArray();
-                                c.MakeBlock(block, y, x);
-                            }
-                        }
-                    }
-
-                    return new SpectralData(result);
-                }
-            }
-            finally
-            {
-                // Remove the temporary dump whether or not parsing succeeded.
-                if (File.Exists(coeffFileFullPath))
-                {
-                    File.Delete(coeffFileFullPath);
-                }
-            }
-        }
-    }
-}
--- a/tests/ImageSharp.Tests/Formats/Jpg/LibJpegToolsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/LibJpegToolsTests.cs
@ -1,8 +1,9 @@
-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
    using System.IO;

    using SixLabors.ImageSharp.PixelFormats;
+    using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;

    using Xunit;

--- a/tests/ImageSharp.Tests/Formats/Jpg/ReferenceImplementations.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/ReferenceImplementations.cs
@ -1,918 +0,0 @@
-// Copyright (c) Six Labors and contributors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-
-using SixLabors.ImageSharp.Formats;
-using SixLabors.ImageSharp.Formats.Jpeg.Common;
-using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Utils;
-
-// ReSharper disable InconsistentNaming
-
-namespace SixLabors.ImageSharp.Tests
-{
-    /// <summary>
-    /// This class contains simplified (inefficient) reference implementations to produce verification data for unit tests.
-    /// Floating point DCT code Ported from https://github.com/norishigefukushima/dct_simd
-    /// </summary>
-    internal static class ReferenceImplementations
-    {
-        /// <summary>
-        /// Transposes, in place, an 8x8 block stored row by row in a <see cref="Span{T}"/>.
-        /// </summary>
-        /// <param name="data">The 64 values of the block.</param>
-        internal static void Transpose8x8(Span<float> data)
-        {
-            // Visit every element below the diagonal and swap it with its mirror above the diagonal.
-            for (int row = 1; row < 8; row++)
-            {
-                for (int column = 0; column < row; column++)
-                {
-                    int below = (row * 8) + column;
-                    int above = (column * 8) + row;
-
-                    float held = data[below];
-                    data[below] = data[above];
-                    data[above] = held;
-                }
-            }
-        }
-
-        /// <summary>
-        /// Writes the transpose of an 8x8 block (stored row by row) into another <see cref="Span{T}"/>.
-        /// </summary>
-        /// <param name="src">The 64 values of the source block.</param>
-        /// <param name="dest">Receives the 64 transposed values.</param>
-        internal static void Transpose8x8(Span<float> src, Span<float> dest)
-        {
-            for (int index = 0; index < 64; index++)
-            {
-                int row = index / 8;
-                int column = index % 8;
-                dest[(column * 8) + row] = src[index];
-            }
-        }
-
-        /// <summary>
-        /// Integer (fixed-point) forward and inverse 8x8 DCT based on the "original" libjpeg/golang code; used as the reference implementation for tests.
-        /// </summary>
-        public static class IntegerReferenceDCT
-        {
-            private const int fix_0_298631336 = 2446; // every fix_A_B constant is A.B multiplied by 1 << Bits (8192) and rounded
-            private const int fix_0_390180644 = 3196;
-            private const int fix_0_541196100 = 4433;
-            private const int fix_0_765366865 = 6270;
-            private const int fix_0_899976223 = 7373;
-            private const int fix_1_175875602 = 9633;
-            private const int fix_1_501321110 = 12299;
-            private const int fix_1_847759065 = 15137;
-            private const int fix_1_961570560 = 16069;
-            private const int fix_2_053119869 = 16819;
-            private const int fix_2_562915447 = 20995;
-            private const int fix_3_072711026 = 25172;
-
-            /// <summary>
-            /// The number of fractional bits carried by the fixed-point fix_* constants.
-            /// </summary>
-            private const int Bits = 13;
-
-            /// <summary>
-            /// The number of extra bits the forward DCT keeps after its first (row) pass; removed again in the second pass.
-            /// </summary>
-            private const int Pass1Bits = 2;
-
-            /// <summary>
-            /// The level shift: the forward DCT subtracts 8 * CenterJSample from the sum of every row.
-            /// </summary>
-            private const int CenterJSample = 128;
-
-            /// <summary>
-            /// Performs a forward DCT on an 8x8 block of coefficients, including a level shift.
-            /// Leave results scaled up by an overall factor of 8.
-            /// </summary>
-            /// <param name="block">The 64 values of the block, row by row (index = y * 8 + x); overwritten with the result.</param>
-            public static void TransformFDCTInplace(Span<int> block)
-            {
-                // Pass 1: process rows.
-                for (int y = 0; y < 8; y++)
-                {
-                    int y8 = y * 8;
-
-                    int x0 = block[y8];
-                    int x1 = block[y8 + 1];
-                    int x2 = block[y8 + 2];
-                    int x3 = block[y8 + 3];
-                    int x4 = block[y8 + 4];
-                    int x5 = block[y8 + 5];
-                    int x6 = block[y8 + 6];
-                    int x7 = block[y8 + 7];
-
-                    int tmp0 = x0 + x7;
-                    int tmp1 = x1 + x6;
-                    int tmp2 = x2 + x5;
-                    int tmp3 = x3 + x4;
-
-                    int tmp10 = tmp0 + tmp3;
-                    int tmp12 = tmp0 - tmp3;
-                    int tmp11 = tmp1 + tmp2;
-                    int tmp13 = tmp1 - tmp2;
-
-                    tmp0 = x0 - x7;
-                    tmp1 = x1 - x6;
-                    tmp2 = x2 - x5;
-                    tmp3 = x3 - x4;
-
-                    block[y8] = (tmp10 + tmp11 - (8 * CenterJSample)) << Pass1Bits; // level shift is applied here, on the row sum
-                    block[y8 + 4] = (tmp10 - tmp11) << Pass1Bits;
-                    int z1 = (tmp12 + tmp13) * fix_0_541196100;
-                    z1 += 1 << (Bits - Pass1Bits - 1); // half of the divisor, so the right shifts below round
-                    block[y8 + 2] = (z1 + (tmp12 * fix_0_765366865)) >> (Bits - Pass1Bits);
-                    block[y8 + 6] = (z1 - (tmp13 * fix_1_847759065)) >> (Bits - Pass1Bits);
-
-                    tmp10 = tmp0 + tmp3;
-                    tmp11 = tmp1 + tmp2;
-                    tmp12 = tmp0 + tmp2;
-                    tmp13 = tmp1 + tmp3;
-                    z1 = (tmp12 + tmp13) * fix_1_175875602;
-                    z1 += 1 << (Bits - Pass1Bits - 1);
-                    tmp0 = tmp0 * fix_1_501321110;
-                    tmp1 = tmp1 * fix_3_072711026;
-                    tmp2 = tmp2 * fix_2_053119869;
-                    tmp3 = tmp3 * fix_0_298631336;
-                    tmp10 = tmp10 * -fix_0_899976223;
-                    tmp11 = tmp11 * -fix_2_562915447;
-                    tmp12 = tmp12 * -fix_0_390180644;
-                    tmp13 = tmp13 * -fix_1_961570560;
-
-                    tmp12 += z1;
-                    tmp13 += z1;
-                    block[y8 + 1] = (tmp0 + tmp10 + tmp12) >> (Bits - Pass1Bits);
-                    block[y8 + 3] = (tmp1 + tmp11 + tmp13) >> (Bits - Pass1Bits);
-                    block[y8 + 5] = (tmp2 + tmp11 + tmp12) >> (Bits - Pass1Bits);
-                    block[y8 + 7] = (tmp3 + tmp10 + tmp13) >> (Bits - Pass1Bits);
-                }
-
-                // Pass 2: process columns.
-                // We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8.
-                for (int x = 0; x < 8; x++)
-                {
-                    int tmp0 = block[x] + block[56 + x];
-                    int tmp1 = block[8 + x] + block[48 + x];
-                    int tmp2 = block[16 + x] + block[40 + x];
-                    int tmp3 = block[24 + x] + block[32 + x];
-
-                    int tmp10 = tmp0 + tmp3 + (1 << (Pass1Bits - 1)); // rounding term for the >> Pass1Bits below
-                    int tmp12 = tmp0 - tmp3;
-                    int tmp11 = tmp1 + tmp2;
-                    int tmp13 = tmp1 - tmp2;
-
-                    tmp0 = block[x] - block[56 + x];
-                    tmp1 = block[8 + x] - block[48 + x];
-                    tmp2 = block[16 + x] - block[40 + x];
-                    tmp3 = block[24 + x] - block[32 + x];
-
-                    block[x] = (tmp10 + tmp11) >> Pass1Bits;
-                    block[32 + x] = (tmp10 - tmp11) >> Pass1Bits;
-
-                    int z1 = (tmp12 + tmp13) * fix_0_541196100;
-                    z1 += 1 << (Bits + Pass1Bits - 1);
-                    block[16 + x] = (z1 + (tmp12 * fix_0_765366865)) >> (Bits + Pass1Bits);
-                    block[48 + x] = (z1 - (tmp13 * fix_1_847759065)) >> (Bits + Pass1Bits);
-
-                    tmp10 = tmp0 + tmp3;
-                    tmp11 = tmp1 + tmp2;
-                    tmp12 = tmp0 + tmp2;
-                    tmp13 = tmp1 + tmp3;
-                    z1 = (tmp12 + tmp13) * fix_1_175875602;
-                    z1 += 1 << (Bits + Pass1Bits - 1);
-                    tmp0 = tmp0 * fix_1_501321110;
-                    tmp1 = tmp1 * fix_3_072711026;
-                    tmp2 = tmp2 * fix_2_053119869;
-                    tmp3 = tmp3 * fix_0_298631336;
-                    tmp10 = tmp10 * -fix_0_899976223;
-                    tmp11 = tmp11 * -fix_2_562915447;
-                    tmp12 = tmp12 * -fix_0_390180644;
-                    tmp13 = tmp13 * -fix_1_961570560;
-
-                    tmp12 += z1;
-                    tmp13 += z1;
-                    block[8 + x] = (tmp0 + tmp10 + tmp12) >> (Bits + Pass1Bits);
-                    block[24 + x] = (tmp1 + tmp11 + tmp13) >> (Bits + Pass1Bits);
-                    block[40 + x] = (tmp2 + tmp11 + tmp12) >> (Bits + Pass1Bits);
-                    block[56 + x] = (tmp3 + tmp10 + tmp13) >> (Bits + Pass1Bits);
-                }
-
-            }
-            private const int w1 = 2841; // 2048*sqrt(2)*cos(1*pi/16)
-            private const int w2 = 2676; // 2048*sqrt(2)*cos(2*pi/16)
-            private const int w3 = 2408; // 2048*sqrt(2)*cos(3*pi/16)
-            private const int w5 = 1609; // 2048*sqrt(2)*cos(5*pi/16)
-            private const int w6 = 1108; // 2048*sqrt(2)*cos(6*pi/16)
-            private const int w7 = 565;  // 2048*sqrt(2)*cos(7*pi/16)
-
-            private const int w1pw7 = w1 + w7; // "p" = plus, "m" = minus: precomputed sums and differences of the w constants
-            private const int w1mw7 = w1 - w7;
-            private const int w2pw6 = w2 + w6;
-            private const int w2mw6 = w2 - w6;
-            private const int w3pw5 = w3 + w5;
-            private const int w3mw5 = w3 - w5;
-
-            private const int r2 = 181; // 256/sqrt(2)
-
-            /// <summary>
-            /// Performs a 2-D Inverse Discrete Cosine Transformation.
-            /// <para>
-            /// The input coefficients should already have been multiplied by the
-            /// appropriate quantization table. We use fixed-point computation, with the
-            /// number of bits for the fractional component varying over the intermediate
-            /// stages.
-            /// </para>
-            /// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
-            /// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
-            /// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
-            /// </summary>
-            /// <param name="src">The 64 coefficients of the block, row by row (index = y * 8 + x); overwritten with the result.</param>
-            public static void TransformIDCTInplace(Span<int> src)
-            {
-                // Horizontal 1-D IDCT.
-                for (int y = 0; y < 8; y++)
-                {
-                    int y8 = y * 8;
-
-                    // If all the AC components are zero, then the IDCT is trivial.
-                    if (src[y8 + 1] == 0 && src[y8 + 2] == 0 && src[y8 + 3] == 0 &&
-                        src[y8 + 4] == 0 && src[y8 + 5] == 0 && src[y8 + 6] == 0 && src[y8 + 7] == 0)
-                    {
-                        int dc = src[y8 + 0] << 3;
-                        src[y8 + 0] = dc;
-                        src[y8 + 1] = dc;
-                        src[y8 + 2] = dc;
-                        src[y8 + 3] = dc;
-                        src[y8 + 4] = dc;
-                        src[y8 + 5] = dc;
-                        src[y8 + 6] = dc;
-                        src[y8 + 7] = dc;
-                        continue;
-                    }
-
-                    // Prescale.
-                    int x0 = (src[y8 + 0] << 11) + 128;
-                    int x1 = src[y8 + 4] << 11;
-                    int x2 = src[y8 + 6];
-                    int x3 = src[y8 + 2];
-                    int x4 = src[y8 + 1];
-                    int x5 = src[y8 + 7];
-                    int x6 = src[y8 + 5];
-                    int x7 = src[y8 + 3];
-
-                    // Stage 1.
-                    int x8 = w7 * (x4 + x5);
-                    x4 = x8 + (w1mw7 * x4);
-                    x5 = x8 - (w1pw7 * x5);
-                    x8 = w3 * (x6 + x7);
-                    x6 = x8 - (w3mw5 * x6);
-                    x7 = x8 - (w3pw5 * x7);
-
-                    // Stage 2.
-                    x8 = x0 + x1;
-                    x0 -= x1;
-                    x1 = w6 * (x3 + x2);
-                    x2 = x1 - (w2pw6 * x2);
-                    x3 = x1 + (w2mw6 * x3);
-                    x1 = x4 + x6;
-                    x4 -= x6;
-                    x6 = x5 + x7;
-                    x5 -= x7;
-
-                    // Stage 3.
-                    x7 = x8 + x3;
-                    x8 -= x3;
-                    x3 = x0 + x2;
-                    x0 -= x2;
-                    x2 = ((r2 * (x4 + x5)) + 128) >> 8;
-                    x4 = ((r2 * (x4 - x5)) + 128) >> 8;
-
-                    // Stage 4.
-                    src[y8 + 0] = (x7 + x1) >> 8;
-                    src[y8 + 1] = (x3 + x2) >> 8;
-                    src[y8 + 2] = (x0 + x4) >> 8;
-                    src[y8 + 3] = (x8 + x6) >> 8;
-                    src[y8 + 4] = (x8 - x6) >> 8;
-                    src[y8 + 5] = (x0 - x4) >> 8;
-                    src[y8 + 6] = (x3 - x2) >> 8;
-                    src[y8 + 7] = (x7 - x1) >> 8;
-                }
-
-                // Vertical 1-D IDCT.
-                for (int x = 0; x < 8; x++)
-                {
-                    // Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
-                    // However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
-                    // we do not bother to check for the all-zero case.
-
-                    // Prescale.
-                    int y0 = (src[x] << 8) + 8192;
-                    int y1 = src[32 + x] << 8;
-                    int y2 = src[48 + x];
-                    int y3 = src[16 + x];
-                    int y4 = src[8 + x];
-                    int y5 = src[56 + x];
-                    int y6 = src[40 + x];
-                    int y7 = src[24 + x];
-
-                    // Stage 1.
-                    int y8 = (w7 * (y4 + y5)) + 4;
-                    y4 = (y8 + (w1mw7 * y4)) >> 3;
-                    y5 = (y8 - (w1pw7 * y5)) >> 3;
-                    y8 = (w3 * (y6 + y7)) + 4;
-                    y6 = (y8 - (w3mw5 * y6)) >> 3;
-                    y7 = (y8 - (w3pw5 * y7)) >> 3;
-
-                    // Stage 2.
-                    y8 = y0 + y1;
-                    y0 -= y1;
-                    y1 = (w6 * (y3 + y2)) + 4;
-                    y2 = (y1 - (w2pw6 * y2)) >> 3;
-                    y3 = (y1 + (w2mw6 * y3)) >> 3;
-                    y1 = y4 + y6;
-                    y4 -= y6;
-                    y6 = y5 + y7;
-                    y5 -= y7;
-
-                    // Stage 3.
-                    y7 = y8 + y3;
-                    y8 -= y3;
-                    y3 = y0 + y2;
-                    y0 -= y2;
-                    y2 = ((r2 * (y4 + y5)) + 128) >> 8;
-                    y4 = ((r2 * (y4 - y5)) + 128) >> 8;
-
-                    // Stage 4.
-                    src[x] = (y7 + y1) >> 14;
-                    src[8 + x] = (y3 + y2) >> 14;
-                    src[16 + x] = (y0 + y4) >> 14;
-                    src[24 + x] = (y8 + y6) >> 14;
-                    src[32 + x] = (y8 - y6) >> 14;
-                    src[40 + x] = (y0 - y4) >> 14;
-                    src[48 + x] = (y3 - y2) >> 14;
-                    src[56 + x] = (y7 - y1) >> 14;
-                }
-            }
-        }
-
-        /// <summary>
-        /// One-dimensional 8-point floating point IDCT: reads y[0..7], writes x[0..7]. Original: https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L200
-        /// </summary>
-        /// <param name="y">Input; the first 8 values are read.</param>
-        /// <param name="x">Output; the first 8 values are written.</param>
-        private static void iDCT1Dllm_32f(Span<float> y, Span<float> x)
-        {
-            float a0, a1, a2, a3, b0, b1, b2, b3;
-            float z0, z1, z2, z3, z4;
-
-            //float r0 = 1.414214f;
-            float r1 = 1.387040f; // the rN values equal sqrt(2) * cos(N * pi / 16) to six decimals
-            float r2 = 1.306563f;
-            float r3 = 1.175876f;
-            //float r4 = 1.000000f;
-            float r5 = 0.785695f;
-            float r6 = 0.541196f;
-            float r7 = 0.275899f;
-
-            z0 = y[1] + y[7]; // odd-indexed inputs produce b0..b3
-            z1 = y[3] + y[5];
-            z2 = y[3] + y[7];
-            z3 = y[1] + y[5];
-            z4 = (z0 + z1) * r3;
-
-            z0 = z0 * (-r3 + r7);
-            z1 = z1 * (-r3 - r1);
-            z2 = z2 * (-r3 - r5) + z4;
-            z3 = z3 * (-r3 + r5) + z4;
-
-            b3 = y[7] * (-r1 + r3 + r5 - r7) + z0 + z2;
-            b2 = y[5] * (r1 + r3 - r5 + r7) + z1 + z3;
-            b1 = y[3] * (r1 + r3 + r5 - r7) + z1 + z2;
-            b0 = y[1] * (r1 + r3 - r5 - r7) + z0 + z3;
-
-            z4 = (y[2] + y[6]) * r6; // even-indexed inputs produce a0..a3
-            z0 = y[0] + y[4];
-            z1 = y[0] - y[4];
-            z2 = z4 - y[6] * (r2 + r6);
-            z3 = z4 + y[2] * (r2 - r6);
-            a0 = z0 + z3;
-            a3 = z0 - z3;
-            a1 = z1 + z2;
-            a2 = z1 - z2;
-
-            x[0] = a0 + b0; // outputs are written in mirrored pairs: x[k] = a + b, x[7 - k] = a - b
-            x[7] = a0 - b0;
-            x[1] = a1 + b1;
-            x[6] = a1 - b1;
-            x[2] = a2 + b2;
-            x[5] = a2 - b2;
-            x[3] = a3 + b3;
-            x[4] = a3 - b3;
-        }
-
-        /// <summary>
-        /// Original: https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
-        /// Applies the 2-D IDCT to "s" and leaves the transformed values in "d", using "temp" as a temporary block.
-        /// </summary>
-        /// <param name="s">The 64 source values.</param>
-        /// <param name="d">Receives the 64 transformed values.</param>
-        /// <param name="temp">Scratch space for 64 values.</param>
-        internal static void iDCT2D_llm(Span<float> s, Span<float> d, Span<float> temp)
-        {
-            // First pass: 1-D IDCT of each group of 8 values of the source, then transpose into d.
-            for (int row = 0; row < 8; row++)
-            {
-                int offset = row * 8;
-                iDCT1Dllm_32f(s.Slice(offset), temp.Slice(offset));
-            }
-
-            Transpose8x8(temp, d);
-
-            // Second pass: the same on the transposed data, then transpose back.
-            for (int row = 0; row < 8; row++)
-            {
-                int offset = row * 8;
-                iDCT1Dllm_32f(d.Slice(offset), temp.Slice(offset));
-            }
-
-            Transpose8x8(temp, d);
-
-            // Final scaling of all 64 values.
-            for (int index = 0; index < 64; index++)
-            {
-                d[index] *= 0.125f;
-            }
-        }
-
-        /// <summary>
-        /// Forward DCT butterfly over an 8x4 part of a block: each of the 8 rows (offsets 0, 8, ..., 56) contributes one Vector4 of four neighbouring values. Original:
-        /// <see>
-        ///     <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
-        /// </see>
-        /// </summary>
-        /// <param name="s">Source; four floats are read at each of the offsets 0, 8, ..., 56.</param>
-        /// <param name="d">Destination; four floats are written at each of the offsets 0, 8, ..., 56.</param>
-        public static void fDCT2D8x4_32f(Span<float> s, Span<float> d)
-        {
-            Vector4 c0 = _mm_load_ps(s, 0);
-            Vector4 c1 = _mm_load_ps(s, 56);
-            Vector4 t0 = (c0 + c1);
-            Vector4 t7 = (c0 - c1);
-
-            c1 = _mm_load_ps(s, 48);
-            c0 = _mm_load_ps(s, 8);
-            Vector4 t1 = (c0 + c1);
-            Vector4 t6 = (c0 - c1);
-
-            c1 = _mm_load_ps(s, 40);
-            c0 = _mm_load_ps(s, 16);
-            Vector4 t2 = (c0 + c1);
-            Vector4 t5 = (c0 - c1);
-
-            c0 = _mm_load_ps(s, 24);
-            c1 = _mm_load_ps(s, 32);
-            Vector4 t3 = (c0 + c1);
-            Vector4 t4 = (c0 - c1);
-
-            /*
-            Scalar form of the block above (x[n] is row n):
-            c1 = x[0]; c2 = x[7]; t0 = c1 + c2; t7 = c1 - c2;
-            c1 = x[1]; c2 = x[6]; t1 = c1 + c2; t6 = c1 - c2;
-            c1 = x[2]; c2 = x[5]; t2 = c1 + c2; t5 = c1 - c2;
-            c1 = x[3]; c2 = x[4]; t3 = c1 + c2; t4 = c1 - c2;
-            */
-
-            c0 = (t0 + t3);
-            Vector4 c3 = (t0 - t3);
-            c1 = (t1 + t2);
-            Vector4 c2 = (t1 - t2);
-
-            /*
-            c0 = t0 + t3; c3 = t0 - t3;
-            c1 = t1 + t2; c2 = t1 - t2;
-            */
-
-            _mm_store_ps(d, 0, (c0 + c1));
-
-            _mm_store_ps(d, 32, (c0 - c1));
-
-            /*y[0] = c0 + c1;
-            y[4] = c0 - c1;*/
-
-            Vector4 w0 = new Vector4(0.541196f);
-            Vector4 w1 = new Vector4(1.306563f);
-
-            _mm_store_ps(d, 16, ((w0 * c2) + (w1 * c3)));
-
-            _mm_store_ps(d, 48, ((w0 * c3) - (w1 * c2)));
-            /*
-            y[2] = c2 * r[6] + c3 * r[2];
-            y[6] = c3 * r[6] - c2 * r[2];
-            */
-
-            w0 = new Vector4(1.175876f);
-            w1 = new Vector4(0.785695f);
-            c3 = ((w0 * t4) + (w1 * t7));
-            c0 = ((w0 * t7) - (w1 * t4));
-            /*
-            c3 = t4 * r[3] + t7 * r[5];
-            c0 = t7 * r[3] - t4 * r[5];
-            */
-
-            w0 = new Vector4(1.387040f);
-            w1 = new Vector4(0.275899f);
-            c2 = ((w0 * t5) + (w1 * t6));
-            c1 = ((w0 * t6) - (w1 * t5));
-            /*
-            c2 = t5 * r[1] + t6 * r[7];
-            c1 = t6 * r[1] - t5 * r[7];
-            */
-
-            _mm_store_ps(d, 24, (c0 - c2));
-
-            _mm_store_ps(d, 40, (c3 - c1));
-            //y[5] = c3 - c1; y[3] = c0 - c2;
-
-            Vector4 invsqrt2 = new Vector4(0.707107f);
-            c0 = ((c0 + c2) * invsqrt2);
-            c3 = ((c3 + c1) * invsqrt2);
-            //c0 = (c0 + c2) * invsqrt2;
-            //c3 = (c3 + c1) * invsqrt2;
-
-            _mm_store_ps(d, 8, (c0 + c3));
-
-            _mm_store_ps(d, 56, (c0 - c3));
-            //y[1] = c0 + c3; y[7] = c0 - c3;
-
-            /* Not ported: the scalar original's final scaling loop, so no scaling is applied in this method.
-            for(i = 0;i < 8;i++)
-            {
-            y[i] *= invsqrt2h;
-            }*/
-        }
-
-        /// <summary>
-        /// Forward DCT of an 8x8 block: transposes, runs <see cref="fDCT2D8x4_32f"/> on both
-        /// 8x4 halves, repeats that once more, and finally multiplies all 64 results by 0.125.
-        /// </summary>
-        /// <param name="s">The 64 source values.</param>
-        /// <param name="d">Receives the 64 transformed values.</param>
-        /// <param name="temp">Scratch space for 64 values.</param>
-        public static void fDCT8x8_llm_sse(Span<float> s, Span<float> d, Span<float> temp)
-        {
-            Transpose8x8(s, temp);
-            fDCT2D8x4_32f(temp, d);
-            fDCT2D8x4_32f(temp.Slice(4), d.Slice(4));
-
-            Transpose8x8(d, temp);
-            fDCT2D8x4_32f(temp, d);
-            fDCT2D8x4_32f(temp.Slice(4), d.Slice(4));
-
-            // Scale the result four values at a time: 16 vectors cover the 64 values.
-            Vector4 scale = new Vector4(0.1250f);
-            for (int offset = 0; offset < 64; offset += 4)
-            {
-                _mm_store_ps(d, offset, _mm_load_ps(d, offset) * scale);
-            }
-        }
-
-        /// <summary>
-        /// Builds a <see cref="Vector4"/> from the four consecutive values starting at <paramref name="offset"/>.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector4 _mm_load_ps(Span<float> src, int offset)
-        {
-            Span<float> window = src.Slice(offset);
-            float x = window[0];
-            float y = window[1];
-            float z = window[2];
-            float w = window[3];
-            return new Vector4(x, y, z, w);
-        }
-
-        /// <summary>
-        /// Writes the X, Y, Z and W components of <paramref name="src"/> to the four consecutive
-        /// positions of <paramref name="dest"/> starting at <paramref name="offset"/>.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void _mm_store_ps(Span<float> dest, int offset, Vector4 src)
-        {
-            Span<float> target = dest.Slice(offset);
-            target[0] = src.X;
-            target[1] = src.Y;
-            target[2] = src.Z;
-            target[3] = src.W;
-        }
-
-        private static readonly Vector4 _1_175876 = new Vector4(1.175876f); // multipliers used by iDCT2D8x4_32f; each value is broadcast to all four lanes
-
-        private static readonly Vector4 _1_961571 = new Vector4(-1.961571f); // note: the stored value is negative although the name has no sign
-
-        private static readonly Vector4 _0_390181 = new Vector4(-0.390181f); // negative value
-
-        private static readonly Vector4 _0_899976 = new Vector4(-0.899976f); // negative value
-
-        private static readonly Vector4 _2_562915 = new Vector4(-2.562915f); // negative value
-
-        private static readonly Vector4 _0_298631 = new Vector4(0.298631f);
-
-        private static readonly Vector4 _2_053120 = new Vector4(2.053120f);
-
-        private static readonly Vector4 _3_072711 = new Vector4(3.072711f);
-
-        private static readonly Vector4 _1_501321 = new Vector4(1.501321f);
-
-        private static readonly Vector4 _0_541196 = new Vector4(0.541196f);
-
-        private static readonly Vector4 _1_847759 = new Vector4(-1.847759f); // negative value
-
-        private static readonly Vector4 _0_765367 = new Vector4(0.765367f);
-
-        /// <summary>
-        /// Original:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// Inverse DCT butterfly over an 8x4 part of a block: each of the 8 rows (offsets 0, 8, ..., 56) contributes one Vector4 of four neighbouring values. No final scaling is applied here.
-        /// </summary>
-        /// <param name="y">Source; four floats are read at each of the offsets 0, 8, ..., 56.</param>
-        /// <param name="x">Destination; four floats are written at each of the offsets 0, 8, ..., 56.</param>
-        internal static void iDCT2D8x4_32f(Span<float> y, Span<float> x)
-        {
-            /*
-            Scalar original, kept for reference; r[i] = sqrt(2) * cos(i * pi / 16):
-            float a0,a1,a2,a3,b0,b1,b2,b3; float z0,z1,z2,z3,z4; float r[8]; int i;
-            for(i = 0;i < 8;i++){ r[i] = (float)(cos((double)i / 16.0 * M_PI) * M_SQRT2); }
-            */
-            /*
-            0: 1.414214
-            1: 1.387040
-            2: 1.306563
-            3: 1.175876
-            4: 1.000000
-            5: 0.785695
-            6: 0.541196
-            7: 0.275899
-            */
-
-            Vector4 my1 = _mm_load_ps(y, 8);
-            Vector4 my7 = _mm_load_ps(y, 56);
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = _mm_load_ps(y, 24);
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = _mm_load_ps(y, 40);
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = ((mz0 + mz1) * _1_175876);
-            //z0 = y[1] + y[7]; z1 = y[3] + y[5]; z2 = y[3] + y[7]; z3 = y[1] + y[5];
-            //z4 = (z0 + z1) * r[3];
-
-            mz2 = mz2 * _1_961571 + mz4; // the constants used in these four lines hold negative values
-            mz3 = mz3 * _0_390181 + mz4;
-            mz0 = mz0 * _0_899976;
-            mz1 = mz1 * _2_562915;
-
-            /*
-            -0.899976
-            -2.562915
-            -1.961571
-            -0.390181
-            z0 = z0 * (-r[3] + r[7]);
-            z1 = z1 * (-r[3] - r[1]);
-            z2 = z2 * (-r[3] - r[5]) + z4;
-            z3 = z3 * (-r[3] + r[5]) + z4;*/
-
-            Vector4 mb3 = my7 * _0_298631 + mz0 + mz2;
-            Vector4 mb2 = my5 * _2_053120 + mz1 + mz3;
-            Vector4 mb1 = my3 * _3_072711 + mz1 + mz2;
-            Vector4 mb0 = my1 * _1_501321 + mz0 + mz3;
-
-            /*
-            0.298631
-            2.053120
-            3.072711
-            1.501321
-            b3 = y[7] * (-r[1] + r[3] + r[5] - r[7]) + z0 + z2;
-            b2 = y[5] * ( r[1] + r[3] - r[5] + r[7]) + z1 + z3;
-            b1 = y[3] * ( r[1] + r[3] + r[5] - r[7]) + z1 + z2;
-            b0 = y[1] * ( r[1] + r[3] - r[5] - r[7]) + z0 + z3;
-            */
-
-            Vector4 my2 = _mm_load_ps(y, 16);
-            Vector4 my6 = _mm_load_ps(y, 48);
-            mz4 = (my2 + my6) * _0_541196;
-            Vector4 my0 = _mm_load_ps(y, 0);
-            Vector4 my4 = _mm_load_ps(y, 32);
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + my6 * _1_847759; // _1_847759 holds -1.847759, so this matches "z4 - y[6] * (r[2] + r[6])" below
-            mz3 = mz4 + my2 * _0_765367;
-
-            my0 = mz0 + mz3; // from here on my0..my3 are reused for the scalar code's a0, a1, a2, a3
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-            /*
-            1.847759
-            0.765367
-            z4 = (y[2] + y[6]) * r[6];
-            z0 = y[0] + y[4]; z1 = y[0] - y[4];
-            z2 = z4 - y[6] * (r[2] + r[6]);
-            z3 = z4 + y[2] * (r[2] - r[6]);
-            a0 = z0 + z3; a3 = z0 - z3;
-            a1 = z1 + z2; a2 = z1 - z2;
-            */
-
-            _mm_store_ps(x, 0, my0 + mb0);
-
-            _mm_store_ps(x, 56, my0 - mb0);
-
-            _mm_store_ps(x, 8, my1 + mb1);
-
-            _mm_store_ps(x, 48, my1 - mb1);
-
-            _mm_store_ps(x, 16, my2 + mb2);
-
-            _mm_store_ps(x, 40, my2 - mb2);
-
-            _mm_store_ps(x, 24, my3 + mb3);
-
-            _mm_store_ps(x, 32, my3 - mb3);
-            /*
-            x[0] = a0 + b0; x[7] = a0 - b0;
-            x[1] = a1 + b1; x[6] = a1 - b1;
-            x[2] = a2 + b2; x[5] = a2 - b2;
-            x[3] = a3 + b3; x[4] = a3 - b3;
-            for(i = 0;i < 8;i++){ x[i] *= 0.353554f; }
-            */
-        }
-
-        /// <summary>
-        /// Copies color values from block to the destination image buffer.
-        /// </summary>
-        /// <param name="block"></param>
-        /// <param name="buffer"></param>
-        /// <param name="stride"></param>
-        internal static unsafe void CopyColorsTo(ref Block8x8F block, Span<byte> buffer, int stride)
-        {
-            fixed (Block8x8F* p = &block)
-            {
-                float* b = (float*)p;
-
-                for (int y = 0; y < 8; y++)
-                {
-                    int y8 = y * 8;
-                    int yStride = y * stride;
-
-                    for (int x = 0; x < 8; x++)
-                    {
-                        float c = b[y8 + x];
-
-                        if (c < -128)
-                        {
-                            c = 0;
-                        }
-                        else if (c > 127)
-                        {
-                            c = 255;
-                        }
-                        else
-                        {
-                            c += 128;
-                        }
-
-                        buffer[yStride + x] = (byte)c;
-                    }
-                }
-            }
-        }
-
-        internal static void fDCT1Dllm_32f(Span<float> x, Span<float> y)
-        {
-            float t0, t1, t2, t3, t4, t5, t6, t7;
-            float c0, c1, c2, c3;
-            float[] r = new float[8];
-
-            //for(i = 0;i < 8;i++){ r[i] = (float)(cos((double)i / 16.0 * M_PI) * M_SQRT2); }
-            r[0] = 1.414214f;
-            r[1] = 1.387040f;
-            r[2] = 1.306563f;
-            r[3] = 1.175876f;
-            r[4] = 1.000000f;
-            r[5] = 0.785695f;
-            r[6] = 0.541196f;
-            r[7] = 0.275899f;
-
-            const float invsqrt2 = 0.707107f; //(float)(1.0f / M_SQRT2);
-            //const float invsqrt2h = 0.353554f; //invsqrt2*0.5f;
-
-            c1 = x[0];
-            c2 = x[7];
-            t0 = c1 + c2;
-            t7 = c1 - c2;
-            c1 = x[1];
-            c2 = x[6];
-            t1 = c1 + c2;
-            t6 = c1 - c2;
-            c1 = x[2];
-            c2 = x[5];
-            t2 = c1 + c2;
-            t5 = c1 - c2;
-            c1 = x[3];
-            c2 = x[4];
-            t3 = c1 + c2;
-            t4 = c1 - c2;
-
-            c0 = t0 + t3;
-            c3 = t0 - t3;
-            c1 = t1 + t2;
-            c2 = t1 - t2;
-
-            y[0] = c0 + c1;
-            y[4] = c0 - c1;
-            y[2] = c2 * r[6] + c3 * r[2];
-            y[6] = c3 * r[6] - c2 * r[2];
-
-            c3 = t4 * r[3] + t7 * r[5];
-            c0 = t7 * r[3] - t4 * r[5];
-            c2 = t5 * r[1] + t6 * r[7];
-            c1 = t6 * r[1] - t5 * r[7];
-
-            y[5] = c3 - c1;
-            y[3] = c0 - c2;
-            c0 = (c0 + c2) * invsqrt2;
-            c3 = (c3 + c1) * invsqrt2;
-            y[1] = c0 + c3;
-            y[7] = c0 - c3;
-        }
-
-        internal static void fDCT2D_llm(
-            Span<float> s,
-            Span<float> d,
-            Span<float> temp,
-            bool downscaleBy8 = false,
-            bool offsetSourceByNeg128 = false)
-        {
-            Span<float> sWorker = offsetSourceByNeg128 ? s.AddScalarToAllValues(-128f) : s;
-
-            for (int j = 0; j < 8; j++)
-            {
-                fDCT1Dllm_32f(sWorker.Slice(j * 8), temp.Slice(j * 8));
-            }
-
-            Transpose8x8(temp, d);
-
-            for (int j = 0; j < 8; j++)
-            {
-                fDCT1Dllm_32f(d.Slice(j * 8), temp.Slice(j * 8));
-            }
-
-            Transpose8x8(temp, d);
-
-            if (downscaleBy8)
-            {
-                for (int j = 0; j < 64; j++)
-                {
-                    d[j] *= 0.125f;
-                }
-            }
-        }
-
-        /// <summary>
-        /// Reference implementation to test <see cref="Block8x8F.UnzigDivRound"/>.
-        /// Rounding is done used an integer-based algorithm defined in <see cref="RationalRound"/>.
-        /// </summary>
-        /// <param name="src">The input block</param>
-        /// <param name="dest">The destination block of integers</param>
-        /// <param name="qt">The quantization table</param>
-        /// <param name="unzigPtr">Pointer to <see cref="UnzigData.Data"/> </param>
-        public static unsafe void UnZigDivRoundRational(Block8x8F* src, int* dest, Block8x8F* qt, int* unzigPtr)
-        {
-            float* s = (float*)src;
-            float* q = (float*)qt;
-
-            for (int zig = 0; zig < Block8x8F.Size; zig++)
-            {
-                int a = (int)s[unzigPtr[zig]];
-                int b = (int)q[zig];
-
-                int val = RationalRound(a, b);
-                dest[zig] = val;
-            }
-        }
-
-        /// <summary>
-        /// Rounds a rational number defined as dividend/divisor into an integer
-        /// </summary>
-        /// <param name="dividend">The dividend</param>
-        /// <param name="divisor">The divisior</param>
-        /// <returns></returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int RationalRound(int dividend, int divisor)
-        {
-            if (dividend >= 0)
-            {
-                return (dividend + (divisor >> 1)) / divisor;
-            }
-
-            return -((-dividend + (divisor >> 1)) / divisor);
-        }
-    }
-}
--- a/tests/ImageSharp.Tests/Formats/Jpg/ReferenceImplementationsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/ReferenceImplementationsTests.cs
@ -11,6 +11,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
    using System;

+    using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
+
    public class ReferenceImplementationsTests : JpegUtilityTestFixture
    {
        public ReferenceImplementationsTests(ITestOutputHelper output)
@ -28,12 +30,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            int[] intData = Create8x8RandomIntData(-200, 200, seed);
            Span<float> floatSrc = intData.ConvertAllToFloat();

-            ReferenceImplementations.IntegerReferenceDCT.TransformIDCTInplace(intData);
+            ReferenceImplementations.IntegerDCT.TransformIDCTInplace(intData);

            float[] dest = new float[64];
            float[] temp = new float[64];

-            ReferenceImplementations.iDCT2D_llm(floatSrc, dest, temp);
+            ReferenceImplementations.FastFloatingPointDCT.iDCT2D_llm(floatSrc, dest, temp);

            for (int i = 0; i < 64; i++)
            {
@ -54,14 +56,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg

            Span<int> block = original.AddScalarToAllValues(128);

-            ReferenceImplementations.IntegerReferenceDCT.TransformFDCTInplace(block);
+            ReferenceImplementations.IntegerDCT.TransformFDCTInplace(block);

            for (int i = 0; i < 64; i++)
            {
                block[i] /= 8;
            }

-            ReferenceImplementations.IntegerReferenceDCT.TransformIDCTInplace(block);
+            ReferenceImplementations.IntegerDCT.TransformIDCTInplace(block);

            for (int i = startAt; i < 64; i++)
            {
@ -84,8 +86,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            float[] dest = new float[64];
            float[] temp = new float[64];

-            ReferenceImplementations.fDCT2D_llm(src, dest, temp, true);
-            ReferenceImplementations.iDCT2D_llm(dest, src, temp);
+            ReferenceImplementations.FastFloatingPointDCT.fDCT2D_llm(src, dest, temp, true);
+            ReferenceImplementations.FastFloatingPointDCT.iDCT2D_llm(dest, src, temp);

            for (int i = startAt; i < 64; i++)
            {
@ -105,12 +107,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            int[] intData = Create8x8RandomIntData(-200, 200, seed);
            float[] floatSrc = intData.ConvertAllToFloat();

-            ReferenceImplementations.IntegerReferenceDCT.TransformFDCTInplace(intData);
+            ReferenceImplementations.IntegerDCT.TransformFDCTInplace(intData);

            float[] dest = new float[64];
            float[] temp = new float[64];

-            ReferenceImplementations.fDCT2D_llm(floatSrc, dest, temp, offsetSourceByNeg128: true);
+            ReferenceImplementations.FastFloatingPointDCT.fDCT2D_llm(floatSrc, dest, temp, offsetSourceByNeg128: true);

            for (int i = 0; i < 64; i++)
            {
--- a/tests/ImageSharp.Tests/Formats/Jpg/SpectralJpegTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/SpectralJpegTests.cs
@ -1,19 +1,15 @@
 // ReSharper disable InconsistentNaming
-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
    using System;
-    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
-    using System.Runtime.CompilerServices;

-    using SixLabors.ImageSharp.Formats;
    using SixLabors.ImageSharp.Formats.Jpeg;
-    using SixLabors.ImageSharp.Formats.Jpeg.Common;
    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort;
    using SixLabors.ImageSharp.Formats.Jpeg.PdfJsPort;
    using SixLabors.ImageSharp.PixelFormats;
-    using SixLabors.ImageSharp.Tests.TestUtilities.ImageComparison;
+    using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;

    using Xunit;
    using Xunit.Abstractions;
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegUtilityTestFixture.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegUtilityTestFixture.cs
@ -1,16 +1,18 @@
 // Copyright (c) Six Labors and contributors.
 // Licensed under the Apache License, Version 2.0.

-using System;
-using System.Diagnostics;
-using System.Text;
-using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Utils;
-using Xunit.Abstractions;
+

 // ReSharper disable InconsistentNaming

-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
 {
+    using System;
+    using System.Diagnostics;
+    using System.Text;
+
+    using Xunit.Abstractions;
+
    public class JpegUtilityTestFixture : MeasureFixture
    {
        public JpegUtilityTestFixture(ITestOutputHelper output) : base(output)
@ -75,7 +77,7 @@ namespace SixLabors.ImageSharp.Tests
        }

        internal static float[] Create8x8RandomFloatData(int minValue, int maxValue, int seed = 42)
-            => Create8x8RandomIntData(minValue, maxValue, seed).ConvertAllToFloat();
+            => ImageSharp.Formats.Jpeg.GolangPort.Utils.SpanExtensions.ConvertAllToFloat(Create8x8RandomIntData(minValue, maxValue, seed));

        internal void Print8x8Data<T>(T[] data) => this.Print8x8Data(new Span<T>(data));

--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
@ -0,0 +1,184 @@
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
+{
+    using System;
+    using System.Linq;
+    using System.Numerics;
+
+    using SixLabors.ImageSharp.Formats.Jpeg.Common;
+    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Components.Decoder;
+    using SixLabors.ImageSharp.Formats.Jpeg.PdfJsPort.Components;
+    using SixLabors.ImageSharp.Memory;
+    using SixLabors.Primitives;
+
+    internal static partial class LibJpegTools
+    {
+        /// <summary>
+        /// Test-side container for the <see cref="Block8x8"/> grid of one jpeg component.
+        /// Tracks the smallest and largest sample passed to <see cref="MakeBlock"/>,
+        /// so block values can be normalized by <see cref="GetBlockValue"/>.
+        /// </summary>
+        public class ComponentData : IEquatable<ComponentData>, IJpegComponent
+        {
+            /// <summary>
+            /// Initializes a new instance of the <see cref="ComponentData"/> class
+            /// and allocates a <see cref="Blocks"/> buffer of the given dimensions.
+            /// </summary>
+            /// <param name="heightInBlocks">The number of block rows</param>
+            /// <param name="widthInBlocks">The number of block columns</param>
+            /// <param name="index">The index of the component</param>
+            public ComponentData(int heightInBlocks, int widthInBlocks, int index)
+            {
+                this.HeightInBlocks = heightInBlocks;
+                this.WidthInBlocks = widthInBlocks;
+                this.Index = index;
+                this.Blocks = new Buffer2D<Block8x8>(this.WidthInBlocks, this.HeightInBlocks);
+            }
+
+            /// <summary>
+            /// Gets the dimensions of the component, measured in blocks.
+            /// </summary>
+            public Size Size => new Size(this.WidthInBlocks, this.HeightInBlocks);
+
+            /// <summary>
+            /// Gets the index of the component.
+            /// </summary>
+            public int Index { get; }
+
+            /// <summary>
+            /// Gets the number of block rows.
+            /// </summary>
+            public int HeightInBlocks { get; }
+
+            /// <summary>
+            /// Gets the number of block columns.
+            /// </summary>
+            public int WidthInBlocks { get; }
+
+            /// <summary>
+            /// Gets the blocks of the component, indexed as [x, y].
+            /// </summary>
+            public Buffer2D<Block8x8> Blocks { get; private set; }
+
+            /// <summary>
+            /// Gets the smallest value passed to <see cref="MakeBlock"/> so far
+            /// (<see cref="short.MaxValue"/> until the first block is added).
+            /// </summary>
+            public short MinVal { get; private set; } = short.MaxValue;
+
+            /// <summary>
+            /// Gets the largest value passed to <see cref="MakeBlock"/> so far
+            /// (<see cref="short.MinValue"/> until the first block is added).
+            /// </summary>
+            public short MaxVal { get; private set; } = short.MinValue;
+
+            /// <summary>
+            /// Stores <paramref name="data"/> as the block at column <paramref name="x"/>, row <paramref name="y"/>
+            /// and widens <see cref="MinVal"/> / <see cref="MaxVal"/> to include its values.
+            /// Note the (y, x) parameter order.
+            /// </summary>
+            /// <param name="data">The values of the block</param>
+            /// <param name="y">The row of the block</param>
+            /// <param name="x">The column of the block</param>
+            internal void MakeBlock(short[] data, int y, int x)
+            {
+                this.MinVal = Math.Min(this.MinVal, data.Min());
+                this.MaxVal = Math.Max(this.MaxVal, data.Max());
+                this.Blocks[x, y] = new Block8x8(data);
+            }
+
+            /// <summary>
+            /// Creates a <see cref="ComponentData"/> by copying every block of a <see cref="FrameComponent"/>.
+            /// </summary>
+            /// <param name="c">The source component</param>
+            /// <param name="index">The index to assign to the result</param>
+            /// <returns>The copied component data</returns>
+            public static ComponentData Load(FrameComponent c, int index)
+            {
+                var result = new ComponentData(c.BlocksPerColumnForMcu, c.BlocksPerLineForMcu, index);
+
+                for (int y = 0; y < result.HeightInBlocks; y++)
+                {
+                    for (int x = 0; x < result.WidthInBlocks; x++)
+                    {
+                        short[] data = c.GetBlockBuffer(y, x).ToArray();
+                        result.MakeBlock(data, y, x);
+                    }
+                }
+
+                return result;
+            }
+
+            /// <summary>
+            /// Creates a <see cref="ComponentData"/> by copying every block of an <see cref="OldComponent"/>.
+            /// </summary>
+            /// <param name="c">The source component</param>
+            /// <returns>The copied component data</returns>
+            public static ComponentData Load(OldComponent c)
+            {
+                var result = new ComponentData(c.HeightInBlocks, c.WidthInBlocks, c.Index);
+
+                for (int y = 0; y < result.HeightInBlocks; y++)
+                {
+                    for (int x = 0; x < result.WidthInBlocks; x++)
+                    {
+                        short[] data = c.GetBlockReference(x, y).ToArray();
+                        result.MakeBlock(data, y, x);
+                    }
+                }
+
+                return result;
+            }
+
+            /// <summary>
+            /// Renders all blocks into a new image, 8x8 pixels per block,
+            /// using the normalized block value for the R, G and B channels.
+            /// </summary>
+            /// <returns>The created image</returns>
+            public Image<Rgba32> CreateGrayScaleImage()
+            {
+                var result = new Image<Rgba32>(this.WidthInBlocks * 8, this.HeightInBlocks * 8);
+
+                for (int by = 0; by < this.HeightInBlocks; by++)
+                {
+                    for (int bx = 0; bx < this.WidthInBlocks; bx++)
+                    {
+                        this.WriteToImage(bx, by, result);
+                    }
+                }
+
+                return result;
+            }
+
+            /// <summary>
+            /// Writes the block at [<paramref name="bx"/>, <paramref name="by"/>] into the matching 8x8 pixel area of the image.
+            /// </summary>
+            /// <param name="bx">The column of the block</param>
+            /// <param name="by">The row of the block</param>
+            /// <param name="image">The destination image</param>
+            internal void WriteToImage(int bx, int by, Image<Rgba32> image)
+            {
+                Block8x8 block = this.Blocks[bx, by];
+
+                for (int y = 0; y < 8; y++)
+                {
+                    for (int x = 0; x < 8; x++)
+                    {
+                        float val = this.GetBlockValue(block, x, y);
+
+                        var v = new Vector4(val, val, val, 1);
+                        Rgba32 color = default(Rgba32);
+                        color.PackFromVector4(v);
+
+                        int yy = (by * 8) + y;
+                        int xx = (bx * 8) + x;
+                        image[xx, yy] = color;
+                    }
+                }
+            }
+
+            /// <summary>
+            /// Gets the value at (<paramref name="x"/>, <paramref name="y"/>) of the block,
+            /// normalized by the [<see cref="MinVal"/>, <see cref="MaxVal"/>] range of this component.
+            /// </summary>
+            /// <param name="block">The block to read from</param>
+            /// <param name="x">The column inside the block</param>
+            /// <param name="y">The row inside the block</param>
+            /// <returns>The normalized value, or 0 when the range of the component is empty</returns>
+            internal float GetBlockValue(Block8x8 block, int x, int y)
+            {
+                float range = this.MaxVal - this.MinVal;
+                if (range == 0)
+                {
+                    // MinVal == MaxVal: dividing would yield NaN or Infinity instead of a usable color value.
+                    return 0;
+                }
+
+                float val = block.GetValueAt(x, y);
+                val -= this.MinVal;
+                return val / range;
+            }
+
+            /// <summary>
+            /// Compares index, dimensions and every block; <see cref="MinVal"/> and <see cref="MaxVal"/> are not compared.
+            /// </summary>
+            /// <param name="other">The instance to compare to</param>
+            /// <returns>True if the two instances hold the same data</returns>
+            public bool Equals(ComponentData other)
+            {
+                if (Object.ReferenceEquals(null, other))
+                {
+                    return false;
+                }
+
+                if (Object.ReferenceEquals(this, other))
+                {
+                    return true;
+                }
+
+                bool sameShape = this.Index == other.Index
+                                 && this.HeightInBlocks == other.HeightInBlocks
+                                 && this.WidthInBlocks == other.WidthInBlocks;
+                if (!sameShape)
+                {
+                    return false;
+                }
+
+                for (int y = 0; y < this.HeightInBlocks; y++)
+                {
+                    for (int x = 0; x < this.WidthInBlocks; x++)
+                    {
+                        Block8x8 a = this.Blocks[x, y];
+                        Block8x8 b = other.Blocks[x, y];
+                        if (!a.Equals(b))
+                        {
+                            return false;
+                        }
+                    }
+                }
+
+                return true;
+            }
+
+            /// <inheritdoc />
+            public override bool Equals(object obj)
+            {
+                if (Object.ReferenceEquals(null, obj))
+                {
+                    return false;
+                }
+
+                if (Object.ReferenceEquals(this, obj))
+                {
+                    return true;
+                }
+
+                if (obj.GetType() != this.GetType())
+                {
+                    return false;
+                }
+
+                return this.Equals((ComponentData)obj);
+            }
+
+            /// <inheritdoc />
+            public override int GetHashCode()
+            {
+                unchecked
+                {
+                    // Only hash what Equals() compares: MinVal / MaxVal are ignored there
+                    // and may go stale when MakeBlock() overwrites an existing block.
+                    int hashCode = this.Index;
+                    hashCode = (hashCode * 397) ^ this.HeightInBlocks;
+                    hashCode = (hashCode * 397) ^ this.WidthInBlocks;
+                    return hashCode;
+                }
+            }
+
+            public static bool operator ==(ComponentData left, ComponentData right)
+            {
+                return Object.Equals(left, right);
+            }
+
+            public static bool operator !=(ComponentData left, ComponentData right)
+            {
+                return !Object.Equals(left, right);
+            }
+        }
+    }
+}
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.SpectralData.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.SpectralData.cs
@ -0,0 +1,146 @@
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
+{
+    using System;
+    using System.Linq;
+    using System.Numerics;
+
+    using SixLabors.ImageSharp.Formats.Jpeg.Common;
+    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort;
+    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Components.Decoder;
+    using SixLabors.ImageSharp.Formats.Jpeg.PdfJsPort;
+    using SixLabors.ImageSharp.Formats.Jpeg.PdfJsPort.Components;
+
+    internal static partial class LibJpegTools
+    {
+        /// <summary>
+        /// Test-side collection holding the <see cref="LibJpegTools.ComponentData"/> of every component of one jpeg image.
+        /// </summary>
+        public class SpectralData : IEquatable<SpectralData>
+        {
+            /// <summary>
+            /// Gets the number of components.
+            /// </summary>
+            public int ComponentCount { get; private set; }
+
+            /// <summary>
+            /// Gets the data of the individual components.
+            /// </summary>
+            public LibJpegTools.ComponentData[] Components { get; private set; }
+
+            /// <summary>
+            /// Initializes a new instance of the <see cref="SpectralData"/> class.
+            /// </summary>
+            /// <param name="components">The component data; must not be null</param>
+            internal SpectralData(LibJpegTools.ComponentData[] components)
+            {
+                this.ComponentCount = components.Length;
+                this.Components = components;
+            }
+
+            /// <summary>
+            /// Copies the blocks of every frame component of a <see cref="JpegDecoderCore"/>.
+            /// </summary>
+            /// <param name="decoder">The decoder to read the components from</param>
+            /// <returns>The extracted <see cref="SpectralData"/></returns>
+            public static SpectralData LoadFromImageSharpDecoder(JpegDecoderCore decoder)
+            {
+                FrameComponent[] srcComponents = decoder.Frame.Components;
+                LibJpegTools.ComponentData[] destComponents = srcComponents.Select(LibJpegTools.ComponentData.Load).ToArray();
+
+                return new SpectralData(destComponents);
+            }
+
+            /// <summary>
+            /// Copies the blocks of every component of an <see cref="OldJpegDecoderCore"/>.
+            /// </summary>
+            /// <param name="decoder">The decoder to read the components from</param>
+            /// <returns>The extracted <see cref="SpectralData"/></returns>
+            public static SpectralData LoadFromImageSharpDecoder(OldJpegDecoderCore decoder)
+            {
+                OldComponent[] srcComponents = decoder.Components;
+                LibJpegTools.ComponentData[] destComponents = srcComponents.Select(LibJpegTools.ComponentData.Load).ToArray();
+
+                return new SpectralData(destComponents);
+            }
+
+            /// <summary>
+            /// Renders the three components into the R, G and B channels of a new image, 8x8 pixels per block.
+            /// </summary>
+            /// <returns>
+            /// The created image, or null when there are not exactly 3 components or their sizes differ.
+            /// </returns>
+            public Image<Rgba32> TryCreateRGBSpectralImage()
+            {
+                if (this.ComponentCount != 3)
+                {
+                    return null;
+                }
+
+                LibJpegTools.ComponentData c0 = this.Components[0];
+                LibJpegTools.ComponentData c1 = this.Components[1];
+                LibJpegTools.ComponentData c2 = this.Components[2];
+
+                if (c0.Size != c1.Size || c1.Size != c2.Size)
+                {
+                    return null;
+                }
+
+                var result = new Image<Rgba32>(c0.WidthInBlocks * 8, c0.HeightInBlocks * 8);
+
+                for (int by = 0; by < c0.HeightInBlocks; by++)
+                {
+                    for (int bx = 0; bx < c0.WidthInBlocks; bx++)
+                    {
+                        this.WriteToImage(bx, by, result);
+                    }
+                }
+
+                return result;
+            }
+
+            /// <summary>
+            /// Writes the blocks at [<paramref name="bx"/>, <paramref name="by"/>] of the first three components
+            /// into the matching 8x8 pixel area of the image, one component per color channel.
+            /// </summary>
+            /// <param name="bx">The column of the block</param>
+            /// <param name="by">The row of the block</param>
+            /// <param name="image">The destination image</param>
+            internal void WriteToImage(int bx, int by, Image<Rgba32> image)
+            {
+                LibJpegTools.ComponentData c0 = this.Components[0];
+                LibJpegTools.ComponentData c1 = this.Components[1];
+                LibJpegTools.ComponentData c2 = this.Components[2];
+
+                Block8x8 block0 = c0.Blocks[bx, by];
+                Block8x8 block1 = c1.Blocks[bx, by];
+                Block8x8 block2 = c2.Blocks[bx, by];
+
+                for (int y = 0; y < 8; y++)
+                {
+                    for (int x = 0; x < 8; x++)
+                    {
+                        // Every channel is normalized with the value range of its own component.
+                        float val0 = c0.GetBlockValue(block0, x, y);
+                        float val1 = c1.GetBlockValue(block1, x, y);
+                        float val2 = c2.GetBlockValue(block2, x, y);
+
+                        var v = new Vector4(val0, val1, val2, 1);
+                        Rgba32 color = default(Rgba32);
+                        color.PackFromVector4(v);
+
+                        int yy = (by * 8) + y;
+                        int xx = (bx * 8) + x;
+                        image[xx, yy] = color;
+                    }
+                }
+            }
+
+            /// <summary>
+            /// Compares the component count and every component pairwise.
+            /// </summary>
+            /// <param name="other">The instance to compare to</param>
+            /// <returns>True if all components are equal</returns>
+            public bool Equals(SpectralData other)
+            {
+                if (Object.ReferenceEquals(null, other))
+                {
+                    return false;
+                }
+
+                if (Object.ReferenceEquals(this, other))
+                {
+                    return true;
+                }
+
+                if (this.ComponentCount != other.ComponentCount)
+                {
+                    return false;
+                }
+
+                for (int i = 0; i < this.ComponentCount; i++)
+                {
+                    LibJpegTools.ComponentData a = this.Components[i];
+                    LibJpegTools.ComponentData b = other.Components[i];
+                    if (!a.Equals(b))
+                    {
+                        return false;
+                    }
+                }
+
+                return true;
+            }
+
+            /// <inheritdoc />
+            public override bool Equals(object obj)
+            {
+                if (Object.ReferenceEquals(null, obj))
+                {
+                    return false;
+                }
+
+                if (Object.ReferenceEquals(this, obj))
+                {
+                    return true;
+                }
+
+                if (obj.GetType() != this.GetType())
+                {
+                    return false;
+                }
+
+                return this.Equals((SpectralData)obj);
+            }
+
+            /// <inheritdoc />
+            public override int GetHashCode()
+            {
+                unchecked
+                {
+                    // An empty component array is valid input for the constructor, so guard the [0] access.
+                    bool hasComponents = this.Components != null && this.Components.Length > 0;
+                    int firstHash = hasComponents ? this.Components[0].GetHashCode() : 0;
+                    return (this.ComponentCount * 397) ^ firstHash;
+                }
+            }
+
+            public static bool operator ==(SpectralData left, SpectralData right)
+            {
+                return Object.Equals(left, right);
+            }
+
+            public static bool operator !=(SpectralData left, SpectralData right)
+            {
+                return !Object.Equals(left, right);
+            }
+        }
+    }
+}
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.cs
@ -0,0 +1,113 @@
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
+{
+    using System;
+    using System.Diagnostics;
+    using System.IO;
+    using System.Numerics;
+    using System.Reflection;
+
+    using SixLabors.ImageSharp.Formats.Jpeg.Common;
+
+    /// <summary>
+    /// Test helpers around the external dump-jpeg-coeffs tool:
+    /// running it, parsing its output and comparing the extracted component data.
+    /// </summary>
+    internal static partial class LibJpegTools
+    {
+        /// <summary>
+        /// Sums <see cref="Block8x8.TotalDifference"/> over all blocks of <paramref name="expected"/>
+        /// against the blocks at the same positions in <paramref name="actual"/>.
+        /// </summary>
+        /// <param name="expected">The expected data; its dimensions define the compared area</param>
+        /// <param name="actual">The actual data; must be at least as large as <paramref name="expected"/></param>
+        /// <returns>
+        /// The total difference, and the total divided by the number of compared values
+        /// (NaN when <paramref name="expected"/> contains no blocks).
+        /// </returns>
+        /// <exception cref="ArgumentException"><paramref name="actual"/> is smaller than <paramref name="expected"/></exception>
+        public static (double total, double average) CalculateDifference(ComponentData expected, ComponentData actual)
+        {
+            if (actual.WidthInBlocks < expected.WidthInBlocks)
+            {
+                throw new ArgumentException("actual.WidthInBlocks < expected.WidthInBlocks");
+            }
+
+            if (actual.HeightInBlocks < expected.HeightInBlocks)
+            {
+                throw new ArgumentException("actual.HeightInBlocks < expected.HeightInBlocks");
+            }
+
+            BigInteger totalDiff = 0;
+            int w = expected.WidthInBlocks;
+            int h = expected.HeightInBlocks;
+            for (int y = 0; y < h; y++)
+            {
+                for (int x = 0; x < w; x++)
+                {
+                    Block8x8 aa = expected.Blocks[x, y];
+                    Block8x8 bb = actual.Blocks[x, y];
+
+                    totalDiff += Block8x8.TotalDifference(ref aa, ref bb);
+                }
+            }
+
+            int count = w * h;
+            double total = (double)totalDiff;
+            double average = (double)totalDiff / (count * Block8x8.Size);
+            return (total, average);
+        }
+
+        /// <summary>
+        /// Gets the full path of the dump-jpeg-coeffs executable inside the tools directory.
+        /// </summary>
+        private static string DumpToolFullPath => Path.Combine(
+            TestEnvironment.ToolsDirectoryFullPath,
+            @"jpeg\dump-jpeg-coeffs.exe");
+
+        /// <summary>
+        /// Runs the dump-jpeg-coeffs tool on <paramref name="sourceFile"/> and blocks until it exits.
+        /// </summary>
+        /// <param name="sourceFile">The path of the input file passed to the tool</param>
+        /// <param name="destFile">The path of the output file passed to the tool</param>
+        public static void RunDumpJpegCoeffsTool(string sourceFile, string destFile)
+        {
+            string args = $@"""{sourceFile}"" ""{destFile}""";
+
+            // Process holds an OS handle, release it as soon as the tool has finished.
+            using (Process process = Process.Start(DumpToolFullPath, args))
+            {
+                process.WaitForExit();
+            }
+        }
+
+        /// <summary>
+        /// Dumps the coefficients of <paramref name="inputFile"/> into a temporary file using the external tool,
+        /// and parses that file into a <see cref="SpectralData"/>. The temporary file is always deleted.
+        /// </summary>
+        /// <param name="inputFile">The test file to process</param>
+        /// <returns>The parsed <see cref="SpectralData"/></returns>
+        /// <exception cref="EndOfStreamException">The dump file ends before all blocks have been read</exception>
+        public static SpectralData ExtractSpectralData(string inputFile)
+        {
+            TestFile testFile = TestFile.Create(inputFile);
+
+            string outDir = TestEnvironment.CreateOutputDirectory(".Temp", "JpegCoeffs");
+
+            // A GUID keeps the temporary name unique, even when several extractions run at the same time.
+            string fn = $"{Path.GetFileName(inputFile)}-{Guid.NewGuid():N}.dctcoeffs";
+            string coeffFileFullPath = Path.Combine(outDir, fn);
+
+            try
+            {
+                RunDumpJpegCoeffsTool(testFile.FullPath, coeffFileFullPath);
+
+                using (var dumpStream = new FileStream(coeffFileFullPath, FileMode.Open))
+                using (var rdr = new BinaryReader(dumpStream))
+                {
+                    // Header: component count, followed by (width, height) in blocks for each component.
+                    int componentCount = rdr.ReadInt16();
+                    ComponentData[] result = new ComponentData[componentCount];
+
+                    for (int i = 0; i < componentCount; i++)
+                    {
+                        int widthInBlocks = rdr.ReadInt16();
+                        int heightInBlocks = rdr.ReadInt16();
+                        result[i] = new ComponentData(heightInBlocks, widthInBlocks, i);
+                    }
+
+                    // Payload: 64 16-bit values per block, component by component, row by row.
+                    byte[] buffer = new byte[64 * sizeof(short)];
+
+                    for (int i = 0; i < result.Length; i++)
+                    {
+                        ComponentData c = result[i];
+
+                        for (int y = 0; y < c.HeightInBlocks; y++)
+                        {
+                            for (int x = 0; x < c.WidthInBlocks; x++)
+                            {
+                                FillBlockBuffer(rdr, buffer);
+
+                                short[] block = buffer.AsSpan().NonPortableCast<byte, short>().ToArray();
+                                c.MakeBlock(block, y, x);
+                            }
+                        }
+                    }
+
+                    return new SpectralData(result);
+                }
+            }
+            finally
+            {
+                if (File.Exists(coeffFileFullPath))
+                {
+                    File.Delete(coeffFileFullPath);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Fills the whole <paramref name="buffer"/> from the reader.
+        /// A single Read() call may return fewer bytes than requested, so reading is repeated until the buffer is full.
+        /// </summary>
+        /// <param name="reader">The reader to read from</param>
+        /// <param name="buffer">The buffer to fill</param>
+        /// <exception cref="EndOfStreamException">The stream ends before the buffer is full</exception>
+        private static void FillBlockBuffer(BinaryReader reader, byte[] buffer)
+        {
+            int filled = 0;
+            while (filled < buffer.Length)
+            {
+                int read = reader.Read(buffer, filled, buffer.Length - filled);
+                if (read == 0)
+                {
+                    throw new EndOfStreamException("Unexpected end of the jpeg coefficient dump.");
+                }
+
+                filled += read;
+            }
+        }
+    }
+}
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.FastFloatingPointDCT.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.FastFloatingPointDCT.cs
@ -0,0 +1,490 @@
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
+{
+    using System;
+    using System.Numerics;
+    using System.Runtime.CompilerServices;
+
+    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Utils;
+
+    internal static partial class ReferenceImplementations
+    {
+        internal static class FastFloatingPointDCT
+        {
+            /// <summary>
+            /// Scalar 1-D 8-point inverse DCT: reads y[0..7] and writes x[0..7]. No output scaling is applied here.
+            /// Ported from:
+            /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L200
+            /// </summary>
+            /// <param name="y">Input coefficients; elements 0..7 are read</param>
+            /// <param name="x">Output values; elements 0..7 are written</param>
+            private static void iDCT1Dllm_32f(Span<float> y, Span<float> x)
+            {
+                float a0, a1, a2, a3, b0, b1, b2, b3;
+                float z0, z1, z2, z3, z4;
+
+                // r{i} ~ sqrt(2) * cos(i * pi / 16); r0 and r4 are not used by this routine.
+                //float r0 = 1.414214f;
+                float r1 = 1.387040f;
+                float r2 = 1.306563f;
+                float r3 = 1.175876f;
+                //float r4 = 1.000000f;
+                float r5 = 0.785695f;
+                float r6 = 0.541196f;
+                float r7 = 0.275899f;
+
+                // Odd-indexed inputs (y[1], y[3], y[5], y[7]) produce b0..b3.
+                z0 = y[1] + y[7];
+                z1 = y[3] + y[5];
+                z2 = y[3] + y[7];
+                z3 = y[1] + y[5];
+                z4 = (z0 + z1) * r3;
+
+                z0 = z0 * (-r3 + r7);
+                z1 = z1 * (-r3 - r1);
+                z2 = z2 * (-r3 - r5) + z4;
+                z3 = z3 * (-r3 + r5) + z4;
+
+                b3 = y[7] * (-r1 + r3 + r5 - r7) + z0 + z2;
+                b2 = y[5] * (r1 + r3 - r5 + r7) + z1 + z3;
+                b1 = y[3] * (r1 + r3 + r5 - r7) + z1 + z2;
+                b0 = y[1] * (r1 + r3 - r5 - r7) + z0 + z3;
+
+                // Even-indexed inputs (y[0], y[2], y[4], y[6]) produce a0..a3.
+                z4 = (y[2] + y[6]) * r6;
+                z0 = y[0] + y[4];
+                z1 = y[0] - y[4];
+                z2 = z4 - y[6] * (r2 + r6);
+                z3 = z4 + y[2] * (r2 - r6);
+                a0 = z0 + z3;
+                a3 = z0 - z3;
+                a1 = z1 + z2;
+                a2 = z1 - z2;
+
+                // Combine: x[k] = a_k + b_k and x[7 - k] = a_k - b_k.
+                // Every read of y has completed before the first write to x.
+                x[0] = a0 + b0;
+                x[7] = a0 - b0;
+                x[1] = a1 + b1;
+                x[6] = a1 - b1;
+                x[2] = a2 + b2;
+                x[5] = a2 - b2;
+                x[3] = a3 + b3;
+                x[4] = a3 - b3;
+            }
+
+            /// <summary>
+            /// Original: https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
+            /// Applies the 2-D IDCT to "s" and leaves the result in "d", using "temp" as a temporary block.
+            /// The 1-D transform is run over eight 8-element rows, the block is transposed, the 1-D transform
+            /// is run again, the block is transposed back, and finally every value is multiplied by 0.125.
+            /// </summary>
+            /// <param name="s">Source block; elements 0..63 are read</param>
+            /// <param name="d">Destination block; elements 0..63 are written</param>
+            /// <param name="temp">Temporary block; elements 0..63 are overwritten</param>
+            internal static void iDCT2D_llm(Span<float> s, Span<float> d, Span<float> temp)
+            {
+                // First pass: one 1-D IDCT per 8-element row of the source.
+                for (int rowStart = 0; rowStart < 64; rowStart += 8)
+                {
+                    iDCT1Dllm_32f(s.Slice(rowStart), temp.Slice(rowStart));
+                }
+
+                ReferenceImplementations.Transpose8x8(temp, d);
+
+                // Second pass: the same 1-D IDCT over the rows of the transposed block.
+                for (int rowStart = 0; rowStart < 64; rowStart += 8)
+                {
+                    iDCT1Dllm_32f(d.Slice(rowStart), temp.Slice(rowStart));
+                }
+
+                ReferenceImplementations.Transpose8x8(temp, d);
+
+                // Final scaling of all 64 values.
+                for (int i = 0; i < 64; i++)
+                {
+                    d[i] = d[i] * 0.125f;
+                }
+            }
+
+            /// <summary>
+            /// Forward 1-D DCT applied to four adjacent columns at once. Each <see cref="Vector4"/> holds four
+            /// consecutive floats taken from one row of a block stored with a stride of 8 (row k starts at
+            /// offset k * 8). Results are written to the same offsets in the destination. The final scaling
+            /// step of the original is left commented out at the end of the method.
+            /// Original:
+            /// <see>
+            ///     <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
+            /// </see>
+            /// </summary>
+            /// <param name="s">Source; offsets k * 8 .. k * 8 + 3 are read for k = 0..7</param>
+            /// <param name="d">Destination; offsets k * 8 .. k * 8 + 3 are written for k = 0..7</param>
+            public static void fDCT2D8x4_32f(Span<float> s, Span<float> d)
+            {
+                // Sums and differences of mirrored rows: (0, 7), (1, 6), (2, 5), (3, 4).
+                Vector4 c0 = _mm_load_ps(s, 0);
+                Vector4 c1 = _mm_load_ps(s, 56);
+                Vector4 t0 = (c0 + c1);
+                Vector4 t7 = (c0 - c1);
+
+                c1 = _mm_load_ps(s, 48);
+                c0 = _mm_load_ps(s, 8);
+                Vector4 t1 = (c0 + c1);
+                Vector4 t6 = (c0 - c1);
+
+                c1 = _mm_load_ps(s, 40);
+                c0 = _mm_load_ps(s, 16);
+                Vector4 t2 = (c0 + c1);
+                Vector4 t5 = (c0 - c1);
+
+                c0 = _mm_load_ps(s, 24);
+                c1 = _mm_load_ps(s, 32);
+                Vector4 t3 = (c0 + c1);
+                Vector4 t4 = (c0 - c1);
+
+                /*
+                c1 = x[0]; c2 = x[7]; t0 = c1 + c2; t7 = c1 - c2;
+                c1 = x[1]; c2 = x[6]; t1 = c1 + c2; t6 = c1 - c2;
+                c1 = x[2]; c2 = x[5]; t2 = c1 + c2; t5 = c1 - c2;
+                c1 = x[3]; c2 = x[4]; t3 = c1 + c2; t4 = c1 - c2;
+                */
+
+                c0 = (t0 + t3);
+                Vector4 c3 = (t0 - t3);
+                c1 = (t1 + t2);
+                Vector4 c2 = (t1 - t2);
+
+                /*
+                c0 = t0 + t3; c3 = t0 - t3;
+                c1 = t1 + t2; c2 = t1 - t2;
+                */
+
+                // Output rows 0 and 4 (offsets 0 and 32).
+                _mm_store_ps(d, 0, (c0 + c1));
+
+                _mm_store_ps(d, 32, (c0 - c1));
+
+                /*y[0] = c0 + c1;
+                y[4] = c0 - c1;*/
+
+                // w0 = r[6], w1 = r[2] in the scalar formulas quoted below.
+                Vector4 w0 = new Vector4(0.541196f);
+                Vector4 w1 = new Vector4(1.306563f);
+
+                // Output rows 2 and 6 (offsets 16 and 48).
+                _mm_store_ps(d, 16, ((w0 * c2) + (w1 * c3)));
+
+                _mm_store_ps(d, 48, ((w0 * c3) - (w1 * c2)));
+                /*
+                y[2] = c2 * r[6] + c3 * r[2];
+                y[6] = c3 * r[6] - c2 * r[2];
+                */
+
+                // w0 = r[3], w1 = r[5].
+                w0 = new Vector4(1.175876f);
+                w1 = new Vector4(0.785695f);
+                c3 = ((w0 * t4) + (w1 * t7));
+                c0 = ((w0 * t7) - (w1 * t4));
+                /*
+                c3 = t4 * r[3] + t7 * r[5];
+                c0 = t7 * r[3] - t4 * r[5];
+                */
+
+                // w0 = r[1], w1 = r[7].
+                w0 = new Vector4(1.387040f);
+                w1 = new Vector4(0.275899f);
+                c2 = ((w0 * t5) + (w1 * t6));
+                c1 = ((w0 * t6) - (w1 * t5));
+                /*
+                c2 = t5 * r[1] + t6 * r[7];
+                c1 = t6 * r[1] - t5 * r[7];
+                */
+
+                // Output rows 3 and 5 (offsets 24 and 40).
+                _mm_store_ps(d, 24, (c0 - c2));
+
+                _mm_store_ps(d, 40, (c3 - c1));
+                //y[5] = c3 - c1; y[3] = c0 - c2;
+
+                Vector4 invsqrt2 = new Vector4(0.707107f);
+                c0 = ((c0 + c2) * invsqrt2);
+                c3 = ((c3 + c1) * invsqrt2);
+                //c0 = (c0 + c2) * invsqrt2;
+                //c3 = (c3 + c1) * invsqrt2;
+
+                // Output rows 1 and 7 (offsets 8 and 56).
+                _mm_store_ps(d, 8, (c0 + c3));
+
+                _mm_store_ps(d, 56, (c0 - c3));
+                //y[1] = c0 + c3; y[7] = c0 - c3;
+
+                // The per-element scaling of the original is intentionally not performed here.
+                /*for(i = 0;i < 8;i++)
+                {
+                y[i] *= invsqrt2h;
+                }*/
+            }
+
+            /// <summary>
+            /// Forward 2-D DCT of an 8x8 block assembled from <see cref="fDCT2D8x4_32f"/>: transpose, transform
+            /// both 4-column halves, transpose again, transform both halves again, then multiply all 64 results
+            /// by 0.125.
+            /// </summary>
+            /// <param name="s">Source block; elements 0..63 are read</param>
+            /// <param name="d">Destination block; elements 0..63 are written</param>
+            /// <param name="temp">Temporary block; elements 0..63 are overwritten</param>
+            public static void fDCT8x8_llm_sse(Span<float> s, Span<float> d, Span<float> temp)
+            {
+                // First pass over the transposed source, left half then right half.
+                ReferenceImplementations.Transpose8x8(s, temp);
+                fDCT2D8x4_32f(temp, d);
+                fDCT2D8x4_32f(temp.Slice(4), d.Slice(4));
+
+                // Second pass over the transposed intermediate result.
+                ReferenceImplementations.Transpose8x8(d, temp);
+                fDCT2D8x4_32f(temp, d);
+                fDCT2D8x4_32f(temp.Slice(4), d.Slice(4));
+
+                // Scale the destination four floats at a time: 16 groups cover all 64 values.
+                Vector4 scale = new Vector4(0.1250f);
+
+                for (int group = 0; group < 16; group++)
+                {
+                    int offset = group * 4;
+                    _mm_store_ps(d, offset, _mm_load_ps(d, offset) * scale);
+                }
+            }
+
+            /// <summary>
+            /// Packs four consecutive floats of <paramref name="src"/>, starting at <paramref name="offset"/>,
+            /// into a <see cref="Vector4"/>.
+            /// </summary>
+            /// <param name="src">Span to read from</param>
+            /// <param name="offset">Index of the first of the four elements</param>
+            /// <returns>Elements offset .. offset + 3 as X, Y, Z and W</returns>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static Vector4 _mm_load_ps(Span<float> src, int offset)
+            {
+                Span<float> lanes = src.Slice(offset);
+                float x = lanes[0];
+                float y = lanes[1];
+                float z = lanes[2];
+                float w = lanes[3];
+                return new Vector4(x, y, z, w);
+            }
+
+            /// <summary>
+            /// Writes the X, Y, Z and W components of <paramref name="src"/> to four consecutive elements of
+            /// <paramref name="dest"/>, starting at <paramref name="offset"/>.
+            /// </summary>
+            /// <param name="dest">Span to write to</param>
+            /// <param name="offset">Index of the first of the four elements</param>
+            /// <param name="src">Vector whose components are stored</param>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static void _mm_store_ps(Span<float> dest, int offset, Vector4 src)
+            {
+                Span<float> lanes = dest.Slice(offset);
+                lanes[0] = src.X;
+                lanes[1] = src.Y;
+                lanes[2] = src.Z;
+                lanes[3] = src.W;
+            }
+
+            // Broadcast multipliers used by iDCT2D8x4_32f. With r[i] ~ sqrt(2) * cos(i * pi / 16), each value is the
+            // combination of r[] terms shown next to it (see the formulas quoted inside iDCT2D8x4_32f).
+            // NOTE: the field names only carry the magnitude; several fields hold a NEGATIVE value.
+
+            // r[3]
+            private static readonly Vector4 _1_175876 = new Vector4(1.175876f);
+
+            // -r[3] - r[5] (negative)
+            private static readonly Vector4 _1_961571 = new Vector4(-1.961571f);
+
+            // -r[3] + r[5] (negative)
+            private static readonly Vector4 _0_390181 = new Vector4(-0.390181f);
+
+            // -r[3] + r[7] (negative)
+            private static readonly Vector4 _0_899976 = new Vector4(-0.899976f);
+
+            // -r[3] - r[1] (negative)
+            private static readonly Vector4 _2_562915 = new Vector4(-2.562915f);
+
+            // -r[1] + r[3] + r[5] - r[7]
+            private static readonly Vector4 _0_298631 = new Vector4(0.298631f);
+
+            // r[1] + r[3] - r[5] + r[7]
+            private static readonly Vector4 _2_053120 = new Vector4(2.053120f);
+
+            // r[1] + r[3] + r[5] - r[7]
+            private static readonly Vector4 _3_072711 = new Vector4(3.072711f);
+
+            // r[1] + r[3] - r[5] - r[7]
+            private static readonly Vector4 _1_501321 = new Vector4(1.501321f);
+
+            // r[6]
+            private static readonly Vector4 _0_541196 = new Vector4(0.541196f);
+
+            // -(r[2] + r[6]) (negative)
+            private static readonly Vector4 _1_847759 = new Vector4(-1.847759f);
+
+            // r[2] - r[6]
+            private static readonly Vector4 _0_765367 = new Vector4(0.765367f);
+
+            /// <summary>
+            /// Original:
+            /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+            /// Does a part of the IDCT job on the given parts of the blocks: a 1-D inverse DCT applied to four
+            /// adjacent columns at once. Each <see cref="Vector4"/> holds four consecutive floats taken from one
+            /// row of a block stored with a stride of 8 (row k starts at offset k * 8). No output scaling is applied.
+            /// </summary>
+            /// <param name="y">Input coefficients; offsets k * 8 .. k * 8 + 3 are read for k = 0..7</param>
+            /// <param name="x">Output values; offsets k * 8 .. k * 8 + 3 are written for k = 0..7</param>
+            internal static void iDCT2D8x4_32f(Span<float> y, Span<float> x)
+            {
+                /*
+                float a0,a1,a2,a3,b0,b1,b2,b3; float z0,z1,z2,z3,z4; float r[8]; int i;
+                for(i = 0;i < 8;i++){ r[i] = (float)(cos((double)i / 16.0 * M_PI) * M_SQRT2); }
+                */
+                /*
+                0: 1.414214
+                1: 1.387040
+                2: 1.306563
+                3: 1.175876
+                4: 1.000000
+                5: 0.785695
+                6: 0.541196
+                7: 0.275899
+                */
+
+                // Odd rows (1, 3, 5, 7) produce mb0..mb3.
+                Vector4 my1 = _mm_load_ps(y, 8);
+                Vector4 my7 = _mm_load_ps(y, 56);
+                Vector4 mz0 = my1 + my7;
+
+                Vector4 my3 = _mm_load_ps(y, 24);
+                Vector4 mz2 = my3 + my7;
+                Vector4 my5 = _mm_load_ps(y, 40);
+                Vector4 mz1 = my3 + my5;
+                Vector4 mz3 = my1 + my5;
+
+                Vector4 mz4 = ((mz0 + mz1) * _1_175876);
+                //z0 = y[1] + y[7]; z1 = y[3] + y[5]; z2 = y[3] + y[7]; z3 = y[1] + y[5];
+                //z4 = (z0 + z1) * r[3];
+
+                // The four multipliers below hold negative values despite their names.
+                mz2 = mz2 * _1_961571 + mz4;
+                mz3 = mz3 * _0_390181 + mz4;
+                mz0 = mz0 * _0_899976;
+                mz1 = mz1 * _2_562915;
+
+                /*
+                -0.899976
+                -2.562915
+                -1.961571
+                -0.390181
+                z0 = z0 * (-r[3] + r[7]);
+                z1 = z1 * (-r[3] - r[1]);
+                z2 = z2 * (-r[3] - r[5]) + z4;
+                z3 = z3 * (-r[3] + r[5]) + z4;*/
+
+                Vector4 mb3 = my7 * _0_298631 + mz0 + mz2;
+                Vector4 mb2 = my5 * _2_053120 + mz1 + mz3;
+                Vector4 mb1 = my3 * _3_072711 + mz1 + mz2;
+                Vector4 mb0 = my1 * _1_501321 + mz0 + mz3;
+
+                /*
+                0.298631
+                2.053120
+                3.072711
+                1.501321
+                b3 = y[7] * (-r[1] + r[3] + r[5] - r[7]) + z0 + z2;
+                b2 = y[5] * ( r[1] + r[3] - r[5] + r[7]) + z1 + z3;
+                b1 = y[3] * ( r[1] + r[3] + r[5] - r[7]) + z1 + z2;
+                b0 = y[1] * ( r[1] + r[3] - r[5] - r[7]) + z0 + z3;
+                */
+
+                // Even rows (0, 2, 4, 6) produce a0..a3, which are stored back into my0..my3.
+                Vector4 my2 = _mm_load_ps(y, 16);
+                Vector4 my6 = _mm_load_ps(y, 48);
+                mz4 = (my2 + my6) * _0_541196;
+                Vector4 my0 = _mm_load_ps(y, 0);
+                Vector4 my4 = _mm_load_ps(y, 32);
+                mz0 = my0 + my4;
+                mz1 = my0 - my4;
+
+                // _1_847759 holds -1.847759, so this addition implements "z4 - y[6] * (r[2] + r[6])".
+                mz2 = mz4 + my6 * _1_847759;
+                mz3 = mz4 + my2 * _0_765367;
+
+                my0 = mz0 + mz3;
+                my3 = mz0 - mz3;
+                my1 = mz1 + mz2;
+                my2 = mz1 - mz2;
+                /*
+                1.847759
+                0.765367
+                z4 = (y[2] + y[6]) * r[6];
+                z0 = y[0] + y[4]; z1 = y[0] - y[4];
+                z2 = z4 - y[6] * (r[2] + r[6]);
+                z3 = z4 + y[2] * (r[2] - r[6]);
+                a0 = z0 + z3; a3 = z0 - z3;
+                a1 = z1 + z2; a2 = z1 - z2;
+                */
+
+                // Combine: output row k = a_k + b_k, output row 7 - k = a_k - b_k.
+                _mm_store_ps(x, 0, my0 + mb0);
+
+                _mm_store_ps(x, 56, my0 - mb0);
+
+                _mm_store_ps(x, 8, my1 + mb1);
+
+                _mm_store_ps(x, 48, my1 - mb1);
+
+                _mm_store_ps(x, 16, my2 + mb2);
+
+                _mm_store_ps(x, 40, my2 - mb2);
+
+                _mm_store_ps(x, 24, my3 + mb3);
+
+                _mm_store_ps(x, 32, my3 - mb3);
+                /*
+                x[0] = a0 + b0; x[7] = a0 - b0;
+                x[1] = a1 + b1; x[6] = a1 - b1;
+                x[2] = a2 + b2; x[5] = a2 - b2;
+                x[3] = a3 + b3; x[4] = a3 - b3;
+                for(i = 0;i < 8;i++){ x[i] *= 0.353554f; }
+                */
+            }
+
+            /// <summary>
+            /// Scalar 1-D 8-point forward DCT: reads x[0..7] and writes y[0..7]. No output scaling is applied
+            /// (the "invsqrt2h" scaling of the original is not performed).
+            /// </summary>
+            /// <param name="x">Input values; elements 0..7 are read</param>
+            /// <param name="y">Output coefficients; elements 0..7 are written</param>
+            internal static void fDCT1Dllm_32f(Span<float> x, Span<float> y)
+            {
+                float t0, t1, t2, t3, t4, t5, t6, t7;
+                float c0, c1, c2, c3;
+
+                // r{i} = (float)(cos((double)i / 16.0 * M_PI) * M_SQRT2) in the original.
+                // Kept as local constants instead of a per-call heap-allocated array; r0 (1.414214) and
+                // r4 (1.000000) are omitted because the algorithm never reads them.
+                const float r1 = 1.387040f;
+                const float r2 = 1.306563f;
+                const float r3 = 1.175876f;
+                const float r5 = 0.785695f;
+                const float r6 = 0.541196f;
+                const float r7 = 0.275899f;
+
+                const float invsqrt2 = 0.707107f; //(float)(1.0f / M_SQRT2);
+                //const float invsqrt2h = 0.353554f; //invsqrt2*0.5f;
+
+                // Sums (t0..t3) and differences (t7..t4) of mirrored inputs: (0, 7), (1, 6), (2, 5), (3, 4).
+                c1 = x[0];
+                c2 = x[7];
+                t0 = c1 + c2;
+                t7 = c1 - c2;
+                c1 = x[1];
+                c2 = x[6];
+                t1 = c1 + c2;
+                t6 = c1 - c2;
+                c1 = x[2];
+                c2 = x[5];
+                t2 = c1 + c2;
+                t5 = c1 - c2;
+                c1 = x[3];
+                c2 = x[4];
+                t3 = c1 + c2;
+                t4 = c1 - c2;
+
+                // Even outputs come from the sums.
+                c0 = t0 + t3;
+                c3 = t0 - t3;
+                c1 = t1 + t2;
+                c2 = t1 - t2;
+
+                y[0] = c0 + c1;
+                y[4] = c0 - c1;
+                y[2] = c2 * r6 + c3 * r2;
+                y[6] = c3 * r6 - c2 * r2;
+
+                // Odd outputs come from the differences.
+                c3 = t4 * r3 + t7 * r5;
+                c0 = t7 * r3 - t4 * r5;
+                c2 = t5 * r1 + t6 * r7;
+                c1 = t6 * r1 - t5 * r7;
+
+                y[5] = c3 - c1;
+                y[3] = c0 - c2;
+                c0 = (c0 + c2) * invsqrt2;
+                c3 = (c3 + c1) * invsqrt2;
+                y[1] = c0 + c3;
+                y[7] = c0 - c3;
+            }
+
+            /// <summary>
+            /// Forward 2-D DCT of an 8x8 block built from <see cref="fDCT1Dllm_32f"/>: the 1-D transform is run
+            /// over eight 8-element rows, the block is transposed, the 1-D transform is run again and the block
+            /// is transposed back.
+            /// </summary>
+            /// <param name="s">Source block; elements 0..63 are read</param>
+            /// <param name="d">Destination block; elements 0..63 are written</param>
+            /// <param name="temp">Temporary block; elements 0..63 are overwritten</param>
+            /// <param name="downscaleBy8">When true, every result is multiplied by 0.125 at the end</param>
+            /// <param name="offsetSourceByNeg128">
+            /// When true, the transform input is the result of s.AddScalarToAllValues(-128f) instead of s itself.
+            /// NOTE(review): whether that helper modifies s in place or returns a copy is not visible here.
+            /// </param>
+            internal static void fDCT2D_llm(
+                Span<float> s,
+                Span<float> d,
+                Span<float> temp,
+                bool downscaleBy8 = false,
+                bool offsetSourceByNeg128 = false)
+            {
+                Span<float> input = s;
+                if (offsetSourceByNeg128)
+                {
+                    input = s.AddScalarToAllValues(-128f);
+                }
+
+                // First pass: one 1-D DCT per 8-element row.
+                for (int rowStart = 0; rowStart < 64; rowStart += 8)
+                {
+                    fDCT1Dllm_32f(input.Slice(rowStart), temp.Slice(rowStart));
+                }
+
+                ReferenceImplementations.Transpose8x8(temp, d);
+
+                // Second pass: the same 1-D DCT over the rows of the transposed block.
+                for (int rowStart = 0; rowStart < 64; rowStart += 8)
+                {
+                    fDCT1Dllm_32f(d.Slice(rowStart), temp.Slice(rowStart));
+                }
+
+                ReferenceImplementations.Transpose8x8(temp, d);
+
+                if (!downscaleBy8)
+                {
+                    return;
+                }
+
+                for (int i = 0; i < 64; i++)
+                {
+                    d[i] = d[i] * 0.125f;
+                }
+            }
+        }
+    }
+}
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.IntegerDCT.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.IntegerDCT.cs
@ -0,0 +1,314 @@
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
+{
+    using System;
+
+    internal static partial class ReferenceImplementations
+    {
+        /// <summary>
+        /// The "original" libjpeg/golang based DCT implementation is used as reference implementation for tests.
+        /// </summary>
+        public static class IntegerDCT
+        {
+            // Fixed-point multipliers for the forward DCT: each constant is the decimal value spelled out in
+            // its name multiplied by 2^13 (see Bits) and rounded to the nearest integer,
+            // e.g. 0.298631336 * 8192 ~ 2446.
+            private const int fix_0_298631336 = 2446;
+            private const int fix_0_390180644 = 3196;
+            private const int fix_0_541196100 = 4433;
+            private const int fix_0_765366865 = 6270;
+            private const int fix_0_899976223 = 7373;
+            private const int fix_1_175875602 = 9633;
+            private const int fix_1_501321110 = 12299;
+            private const int fix_1_847759065 = 15137;
+            private const int fix_1_961570560 = 16069;
+            private const int fix_2_053119869 = 16819;
+            private const int fix_2_562915447 = 20995;
+            private const int fix_3_072711026 = 25172;
+
+            /// <summary>
+            /// The number of fractional bits carried by the fix_* fixed-point constants;
+            /// products with them are shifted right by an amount derived from this value.
+            /// </summary>
+            private const int Bits = 13;
+
+            /// <summary>
+            /// The number of extra bits the first (row) pass of the forward DCT leaves in its results;
+            /// the second (column) pass shifts them out again.
+            /// </summary>
+            private const int Pass1Bits = 2;
+
+            /// <summary>
+            /// The level-shift value: 8 * CenterJSample is subtracted from the sum of each row's 8 samples
+            /// when the first output of that row is computed in the forward DCT.
+            /// </summary>
+            private const int CenterJSample = 128;
+
+            /// <summary>
+            /// Performs a forward DCT on an 8x8 block of coefficients, including a level shift.
+            /// Leave results scaled up by an overall factor of 8.
+            /// The block is stored row by row (row y starts at index y * 8) and is overwritten with the result.
+            /// </summary>
+            /// <param name="block">The block of coefficients; elements 0..63 are read and written.</param>
+            public static void TransformFDCTInplace(Span<int> block)
+            {
+                // Pass 1: process rows.
+                for (int y = 0; y < 8; y++)
+                {
+                    int y8 = y * 8;
+
+                    // Read the whole row up front because it is overwritten below.
+                    int x0 = block[y8];
+                    int x1 = block[y8 + 1];
+                    int x2 = block[y8 + 2];
+                    int x3 = block[y8 + 3];
+                    int x4 = block[y8 + 4];
+                    int x5 = block[y8 + 5];
+                    int x6 = block[y8 + 6];
+                    int x7 = block[y8 + 7];
+
+                    // Sums of mirrored samples feed the even outputs.
+                    int tmp0 = x0 + x7;
+                    int tmp1 = x1 + x6;
+                    int tmp2 = x2 + x5;
+                    int tmp3 = x3 + x4;
+
+                    int tmp10 = tmp0 + tmp3;
+                    int tmp12 = tmp0 - tmp3;
+                    int tmp11 = tmp1 + tmp2;
+                    int tmp13 = tmp1 - tmp2;
+
+                    // Differences of mirrored samples feed the odd outputs.
+                    tmp0 = x0 - x7;
+                    tmp1 = x1 - x6;
+                    tmp2 = x2 - x5;
+                    tmp3 = x3 - x4;
+
+                    // The level shift is applied once, on the sum of all 8 samples.
+                    block[y8] = (tmp10 + tmp11 - (8 * CenterJSample)) << Pass1Bits;
+                    block[y8 + 4] = (tmp10 - tmp11) << Pass1Bits;
+                    int z1 = (tmp12 + tmp13) * fix_0_541196100;
+                    // Add half of the divisor so the right shift below rounds instead of truncating.
+                    z1 += 1 << (Bits - Pass1Bits - 1);
+                    block[y8 + 2] = (z1 + (tmp12 * fix_0_765366865)) >> (Bits - Pass1Bits);
+                    block[y8 + 6] = (z1 - (tmp13 * fix_1_847759065)) >> (Bits - Pass1Bits);
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp0 + tmp2;
+                    tmp13 = tmp1 + tmp3;
+                    z1 = (tmp12 + tmp13) * fix_1_175875602;
+                    z1 += 1 << (Bits - Pass1Bits - 1);
+                    tmp0 = tmp0 * fix_1_501321110;
+                    tmp1 = tmp1 * fix_3_072711026;
+                    tmp2 = tmp2 * fix_2_053119869;
+                    tmp3 = tmp3 * fix_0_298631336;
+                    tmp10 = tmp10 * -fix_0_899976223;
+                    tmp11 = tmp11 * -fix_2_562915447;
+                    tmp12 = tmp12 * -fix_0_390180644;
+                    tmp13 = tmp13 * -fix_1_961570560;
+
+                    tmp12 += z1;
+                    tmp13 += z1;
+                    block[y8 + 1] = (tmp0 + tmp10 + tmp12) >> (Bits - Pass1Bits);
+                    block[y8 + 3] = (tmp1 + tmp11 + tmp13) >> (Bits - Pass1Bits);
+                    block[y8 + 5] = (tmp2 + tmp11 + tmp12) >> (Bits - Pass1Bits);
+                    block[y8 + 7] = (tmp3 + tmp10 + tmp13) >> (Bits - Pass1Bits);
+                }
+
+                // Pass 2: process columns.
+                // We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8.
+                for (int x = 0; x < 8; x++)
+                {
+                    int tmp0 = block[x] + block[56 + x];
+                    int tmp1 = block[8 + x] + block[48 + x];
+                    int tmp2 = block[16 + x] + block[40 + x];
+                    int tmp3 = block[24 + x] + block[32 + x];
+
+                    // The rounding term for the ">> Pass1Bits" shifts below is folded into tmp10.
+                    int tmp10 = tmp0 + tmp3 + (1 << (Pass1Bits - 1));
+                    int tmp12 = tmp0 - tmp3;
+                    int tmp11 = tmp1 + tmp2;
+                    int tmp13 = tmp1 - tmp2;
+
+                    tmp0 = block[x] - block[56 + x];
+                    tmp1 = block[8 + x] - block[48 + x];
+                    tmp2 = block[16 + x] - block[40 + x];
+                    tmp3 = block[24 + x] - block[32 + x];
+
+                    block[x] = (tmp10 + tmp11) >> Pass1Bits;
+                    block[32 + x] = (tmp10 - tmp11) >> Pass1Bits;
+
+                    int z1 = (tmp12 + tmp13) * fix_0_541196100;
+                    z1 += 1 << (Bits + Pass1Bits - 1);
+                    block[16 + x] = (z1 + (tmp12 * fix_0_765366865)) >> (Bits + Pass1Bits);
+                    block[48 + x] = (z1 - (tmp13 * fix_1_847759065)) >> (Bits + Pass1Bits);
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp0 + tmp2;
+                    tmp13 = tmp1 + tmp3;
+                    z1 = (tmp12 + tmp13) * fix_1_175875602;
+                    z1 += 1 << (Bits + Pass1Bits - 1);
+                    tmp0 = tmp0 * fix_1_501321110;
+                    tmp1 = tmp1 * fix_3_072711026;
+                    tmp2 = tmp2 * fix_2_053119869;
+                    tmp3 = tmp3 * fix_0_298631336;
+                    tmp10 = tmp10 * -fix_0_899976223;
+                    tmp11 = tmp11 * -fix_2_562915447;
+                    tmp12 = tmp12 * -fix_0_390180644;
+                    tmp13 = tmp13 * -fix_1_961570560;
+
+                    tmp12 += z1;
+                    tmp13 += z1;
+                    block[8 + x] = (tmp0 + tmp10 + tmp12) >> (Bits + Pass1Bits);
+                    block[24 + x] = (tmp1 + tmp11 + tmp13) >> (Bits + Pass1Bits);
+                    block[40 + x] = (tmp2 + tmp11 + tmp12) >> (Bits + Pass1Bits);
+                    block[56 + x] = (tmp3 + tmp10 + tmp13) >> (Bits + Pass1Bits);
+                }
+
+            }
+            // Fixed-point multipliers for the inverse DCT, scaled by 2048 as noted on each line.
+            private const int w1 = 2841; // 2048*sqrt(2)*cos(1*pi/16)
+            private const int w2 = 2676; // 2048*sqrt(2)*cos(2*pi/16)
+            private const int w3 = 2408; // 2048*sqrt(2)*cos(3*pi/16)
+            private const int w5 = 1609; // 2048*sqrt(2)*cos(5*pi/16)
+            private const int w6 = 1108; // 2048*sqrt(2)*cos(6*pi/16)
+            private const int w7 = 565;  // 2048*sqrt(2)*cos(7*pi/16)
+
+            // Precomputed sums ("p") and differences ("m") of the multipliers above.
+            private const int w1pw7 = w1 + w7;
+            private const int w1mw7 = w1 - w7;
+            private const int w2pw6 = w2 + w6;
+            private const int w2mw6 = w2 - w6;
+            private const int w3pw5 = w3 + w5;
+            private const int w3mw5 = w3 - w5;
+
+            private const int r2 = 181; // 256/sqrt(2)
+
+            /// <summary>
+            /// Performs a 2-D Inverse Discrete Cosine Transformation.
+            /// <para>
+            /// The input coefficients should already have been multiplied by the
+            /// appropriate quantization table. We use fixed-point computation, with the
+            /// number of bits for the fractional component varying over the intermediate
+            /// stages.
+            /// </para>
+            /// <para>
+            /// The block is stored row by row (row y starts at index y * 8) and is overwritten with the result:
+            /// a horizontal 1-D pass over every row is followed by a vertical 1-D pass over every column.
+            /// </para>
+            /// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
+            /// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
+            /// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
+            /// </summary>
+            /// <param name="src">The source block of coefficients; elements 0..63 are read and written</param>
+            public static void TransformIDCTInplace(Span<int> src)
+            {
+                // Horizontal 1-D IDCT.
+                for (int y = 0; y < 8; y++)
+                {
+                    int y8 = y * 8;
+
+                    // If all the AC components are zero, then the IDCT is trivial.
+                    if (src[y8 + 1] == 0 && src[y8 + 2] == 0 && src[y8 + 3] == 0 &&
+                        src[y8 + 4] == 0 && src[y8 + 5] == 0 && src[y8 + 6] == 0 && src[y8 + 7] == 0)
+                    {
+                        // Every output of the row equals the first coefficient shifted left by 3.
+                        int dc = src[y8 + 0] << 3;
+                        src[y8 + 0] = dc;
+                        src[y8 + 1] = dc;
+                        src[y8 + 2] = dc;
+                        src[y8 + 3] = dc;
+                        src[y8 + 4] = dc;
+                        src[y8 + 5] = dc;
+                        src[y8 + 6] = dc;
+                        src[y8 + 7] = dc;
+                        continue;
+                    }
+
+                    // Prescale.
+                    // Note the permuted read order: x1..x7 take elements 4, 6, 2, 1, 7, 5, 3 of the row.
+                    int x0 = (src[y8 + 0] << 11) + 128;
+                    int x1 = src[y8 + 4] << 11;
+                    int x2 = src[y8 + 6];
+                    int x3 = src[y8 + 2];
+                    int x4 = src[y8 + 1];
+                    int x5 = src[y8 + 7];
+                    int x6 = src[y8 + 5];
+                    int x7 = src[y8 + 3];
+
+                    // Stage 1.
+                    int x8 = w7 * (x4 + x5);
+                    x4 = x8 + (w1mw7 * x4);
+                    x5 = x8 - (w1pw7 * x5);
+                    x8 = w3 * (x6 + x7);
+                    x6 = x8 - (w3mw5 * x6);
+                    x7 = x8 - (w3pw5 * x7);
+
+                    // Stage 2.
+                    x8 = x0 + x1;
+                    x0 -= x1;
+                    x1 = w6 * (x3 + x2);
+                    x2 = x1 - (w2pw6 * x2);
+                    x3 = x1 + (w2mw6 * x3);
+                    x1 = x4 + x6;
+                    x4 -= x6;
+                    x6 = x5 + x7;
+                    x5 -= x7;
+
+                    // Stage 3.
+                    x7 = x8 + x3;
+                    x8 -= x3;
+                    x3 = x0 + x2;
+                    x0 -= x2;
+                    // r2 is 256/sqrt(2); adding 128 before ">> 8" rounds the product.
+                    x2 = ((r2 * (x4 + x5)) + 128) >> 8;
+                    x4 = ((r2 * (x4 - x5)) + 128) >> 8;
+
+                    // Stage 4.
+                    src[y8 + 0] = (x7 + x1) >> 8;
+                    src[y8 + 1] = (x3 + x2) >> 8;
+                    src[y8 + 2] = (x0 + x4) >> 8;
+                    src[y8 + 3] = (x8 + x6) >> 8;
+                    src[y8 + 4] = (x8 - x6) >> 8;
+                    src[y8 + 5] = (x0 - x4) >> 8;
+                    src[y8 + 6] = (x3 - x2) >> 8;
+                    src[y8 + 7] = (x7 - x1) >> 8;
+                }
+
+                // Vertical 1-D IDCT.
+                for (int x = 0; x < 8; x++)
+                {
+                    // Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
+                    // However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
+                    // we do not bother to check for the all-zero case.
+
+                    // Prescale.
+                    // Same permuted read order as the horizontal pass, applied to rows 0, 4, 6, 2, 1, 7, 5, 3 of column x.
+                    int y0 = (src[x] << 8) + 8192;
+                    int y1 = src[32 + x] << 8;
+                    int y2 = src[48 + x];
+                    int y3 = src[16 + x];
+                    int y4 = src[8 + x];
+                    int y5 = src[56 + x];
+                    int y6 = src[40 + x];
+                    int y7 = src[24 + x];
+
+                    // Stage 1.
+                    // Unlike the horizontal pass, the products are rounded ("+ 4") and shifted right by 3 here.
+                    int y8 = (w7 * (y4 + y5)) + 4;
+                    y4 = (y8 + (w1mw7 * y4)) >> 3;
+                    y5 = (y8 - (w1pw7 * y5)) >> 3;
+                    y8 = (w3 * (y6 + y7)) + 4;
+                    y6 = (y8 - (w3mw5 * y6)) >> 3;
+                    y7 = (y8 - (w3pw5 * y7)) >> 3;
+
+                    // Stage 2.
+                    y8 = y0 + y1;
+                    y0 -= y1;
+                    y1 = (w6 * (y3 + y2)) + 4;
+                    y2 = (y1 - (w2pw6 * y2)) >> 3;
+                    y3 = (y1 + (w2mw6 * y3)) >> 3;
+                    y1 = y4 + y6;
+                    y4 -= y6;
+                    y6 = y5 + y7;
+                    y5 -= y7;
+
+                    // Stage 3.
+                    y7 = y8 + y3;
+                    y8 -= y3;
+                    y3 = y0 + y2;
+                    y0 -= y2;
+                    y2 = ((r2 * (y4 + y5)) + 128) >> 8;
+                    y4 = ((r2 * (y4 - y5)) + 128) >> 8;
+
+                    // Stage 4.
+                    // The 8192 added to y0 in the prescale is half of 1 << 14, so these shifts round.
+                    src[x] = (y7 + y1) >> 14;
+                    src[8 + x] = (y3 + y2) >> 14;
+                    src[16 + x] = (y0 + y4) >> 14;
+                    src[24 + x] = (y8 + y6) >> 14;
+                    src[32 + x] = (y8 - y6) >> 14;
+                    src[40 + x] = (y0 - y4) >> 14;
+                    src[48 + x] = (y3 - y2) >> 14;
+                    src[56 + x] = (y7 - y1) >> 14;
+                }
+            }
+        }
+    }
+}
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
@ -0,0 +1,135 @@
+// Copyright (c) Six Labors and contributors.
+// Licensed under the Apache License, Version 2.0.
+
+
+
+// ReSharper disable InconsistentNaming
+
+
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
+{
+    using System;
+    using System.Runtime.CompilerServices;
+
+    using SixLabors.ImageSharp.Formats.Jpeg.Common;
+
+    /// <summary>
+    /// This class contains simplified (inefficient) reference implementations that produce verification data for unit tests.
+    /// Floating point DCT code ported from https://github.com/norishigefukushima/dct_simd
+    /// </summary>
+    internal static partial class ReferenceImplementations
+    {
+        /// <summary>
+        /// Transposes, in place, an 8x8 block stored row by row in a <see cref="Span{T}"/>.
+        /// </summary>
+        /// <param name="data">The 64 values of the block; element (row, column) lives at index row * 8 + column.</param>
+        internal static void Transpose8x8(Span<float> data)
+        {
+            // Visit only the cells below the main diagonal and exchange each one
+            // with its mirror image above the diagonal.
+            for (int row = 1; row < 8; row++)
+            {
+                for (int col = 0; col < row; col++)
+                {
+                    int below = (row * 8) + col;
+                    int above = (col * 8) + row;
+
+                    float held = data[below];
+                    data[below] = data[above];
+                    data[above] = held;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Writes the transpose of an 8x8 block stored row by row in <paramref name="src"/> into <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="src">The 64 source values; left unmodified.</param>
+        /// <param name="dest">Receives the 64 transposed values.</param>
+        internal static void Transpose8x8(Span<float> src, Span<float> dest)
+        {
+            // Walk the source linearly; the element at (row, col) lands at (col, row).
+            for (int index = 0; index < 64; index++)
+            {
+                int row = index >> 3;
+                int col = index & 7;
+                dest[(col * 8) + row] = src[index];
+            }
+        }
+
+        /// <summary>
+        /// Writes the 64 values of <paramref name="block"/> into <paramref name="buffer"/> as bytes.
+        /// Values below -128 become 0, values above 127 become 255, and every other value
+        /// is shifted up by 128 before being cast to <see cref="byte"/>.
+        /// </summary>
+        /// <param name="block">The source block, read as 64 consecutive floats in row-major order.</param>
+        /// <param name="buffer">The destination buffer; the value at (x, y) is stored at index y * stride + x.</param>
+        /// <param name="stride">The distance, in elements, between the starts of two consecutive destination rows.</param>
+        internal static unsafe void CopyColorsTo(ref Block8x8F block, Span<byte> buffer, int stride)
+        {
+            fixed (Block8x8F* pinned = &block)
+            {
+                // Reinterpret the pinned block as a flat array of floats.
+                float* values = (float*)pinned;
+
+                for (int row = 0; row < 8; row++)
+                {
+                    int srcRow = row * 8;
+                    int destRow = row * stride;
+
+                    for (int col = 0; col < 8; col++)
+                    {
+                        float sample = values[srcRow + col];
+
+                        // Clamp to the byte range, otherwise apply the +128 level shift.
+                        float shifted = sample < -128
+                            ? 0
+                            : (sample > 127 ? 255 : sample + 128);
+
+                        buffer[destRow + col] = (byte)shifted;
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Reference implementation to test <see cref="Block8x8F.UnzigDivRound"/>.
+        /// For every position, the source value selected through <paramref name="unzigPtr"/> and the
+        /// quantization value are truncated to integers, and their quotient is rounded using the
+        /// integer-based algorithm defined in <see cref="RationalRound(int,int)"/>.
+        /// </summary>
+        /// <param name="src">The input block</param>
+        /// <param name="dest">The destination block of integers</param>
+        /// <param name="qt">The quantization table</param>
+        /// <param name="unzigPtr">Pointer to <see cref="UnzigData.Data"/> </param>
+        public static unsafe void UnZigDivRoundRational(Block8x8F* src, int* dest, Block8x8F* qt, int* unzigPtr)
+        {
+            // Both blocks are accessed as flat arrays of floats.
+            float* coefficients = (float*)src;
+            float* divisors = (float*)qt;
+
+            for (int position = 0; position < Block8x8F.Size; position++)
+            {
+                int dividend = (int)coefficients[unzigPtr[position]];
+                int divisor = (int)divisors[position];
+
+                dest[position] = RationalRound(dividend, divisor);
+            }
+        }
+
+        /// <summary>
+        /// Rounds the rational number dividend/divisor to an integer using integer arithmetic only:
+        /// half of the divisor is added to the magnitude of the dividend before the division,
+        /// and the sign of the dividend is applied to the result.
+        /// </summary>
+        /// <param name="dividend">The dividend</param>
+        /// <param name="divisor">The divisor</param>
+        /// <returns>The rounded quotient</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int RationalRound(int dividend, int divisor)
+        {
+            int half = divisor >> 1;
+
+            return dividend >= 0
+                ? (dividend + half) / divisor
+                : -((-dividend + half) / divisor);
+        }
+    }
+}
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/VerifyJpeg.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/VerifyJpeg.cs
@ -1,4 +1,4 @@
-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils
 {
    using System.Collections.Generic;
    using System.Linq;
--- a/tests/ImageSharp.Tests/Formats/Jpg/YCbCrImageTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/YCbCrImageTests.cs
@ -1,13 +1,14 @@
 // Copyright (c) Six Labors and contributors.
 // Licensed under the Apache License, Version 2.0.

-using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Components.Decoder;
-using SixLabors.Primitives;
-using Xunit;
-using Xunit.Abstractions;
-
-namespace SixLabors.ImageSharp.Tests
+namespace SixLabors.ImageSharp.Tests.Formats.Jpg
 {
+    using SixLabors.ImageSharp.Formats.Jpeg.GolangPort.Components.Decoder;
+    using SixLabors.Primitives;
+
+    using Xunit;
+    using Xunit.Abstractions;
+
    public class YCbCrImageTests
    {
        public YCbCrImageTests(ITestOutputHelper output)
--- a/tests/ImageSharp.Tests/ImageSharp.Tests.csproj
+++ b/tests/ImageSharp.Tests/ImageSharp.Tests.csproj
@ -40,5 +40,6 @@
  </ItemGroup>
  <ItemGroup>
    <Folder Include="TestUtilities\Factories\" />
+    <Folder Include="Utils\Jpg\Formats\" />
  </ItemGroup>
 </Project>