(re) added DCT, Test & ReferenceImplementations changes

9 years ago · a1cbb48077
10 changed files with 1600 additions and 182 deletions
--- a/src/ImageSharp/Formats/Jpg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpg/Components/Block8x8F.cs
@ -160,28 +160,50 @@ namespace ImageSharp.Formats
        /// <summary>
        /// Multiply in place
        /// </summary>
-        /// <param name="scalar">Scalar to multiply by</param>
+        /// <param name="scaleVec">Vector to multiply by</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public void MultiplyAllInplace(Vector4 scalar)
+        public void MultiplyAllInplace(Vector4 scaleVec)
        {
-            this.V0L *= scalar;
-            this.V0R *= scalar;
-            this.V1L *= scalar;
-            this.V1R *= scalar;
-            this.V2L *= scalar;
-            this.V2R *= scalar;
-            this.V3L *= scalar;
-            this.V3R *= scalar;
-            this.V4L *= scalar;
-            this.V4R *= scalar;
-            this.V5L *= scalar;
-            this.V5R *= scalar;
-            this.V6L *= scalar;
-            this.V6R *= scalar;
-            this.V7L *= scalar;
-            this.V7R *= scalar;
+            this.V0L *= scaleVec;
+            this.V0R *= scaleVec;
+            this.V1L *= scaleVec;
+            this.V1R *= scaleVec;
+            this.V2L *= scaleVec;
+            this.V2R *= scaleVec;
+            this.V3L *= scaleVec;
+            this.V3R *= scaleVec;
+            this.V4L *= scaleVec;
+            this.V4R *= scaleVec;
+            this.V5L *= scaleVec;
+            this.V5R *= scaleVec;
+            this.V6L *= scaleVec;
+            this.V6R *= scaleVec;
+            this.V7L *= scaleVec;
+            this.V7R *= scaleVec;
        }

+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void AddToAllInplace(Vector4 diff)
+        {
+            this.V0L += diff;
+            this.V0R += diff;
+            this.V1L += diff;
+            this.V1R += diff;
+            this.V2L += diff;
+            this.V2R += diff;
+            this.V3L += diff;
+            this.V3R += diff;
+            this.V4L += diff;
+            this.V4R += diff;
+            this.V5L += diff;
+            this.V5R += diff;
+            this.V6L += diff;
+            this.V6R += diff;
+            this.V7L += diff;
+            this.V7R += diff;
+        }
+
+
        /// <summary>
        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization)
        /// </summary>
@ -206,7 +228,7 @@ namespace ImageSharp.Formats
        /// </summary>
        /// <param name="blockPtr">Block pointer</param>
        /// <param name="idx">Index</param>
-        /// <returns>The scalar value at the specified index</returns>
+        /// <returns>The scaleVec value at the specified index</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static unsafe float GetScalarAt(Block8x8F* blockPtr, int idx)
        {
@ -420,6 +442,7 @@ namespace ImageSharp.Formats
            this.CopyTo(legacyBlock.Data);
        }

+        
        /// <summary>
        /// Level shift by +128, clip to [0, 255], and write to buffer.
        /// </summary>
--- a/src/ImageSharp/Formats/Jpg/Components/DCT.cs
+++ b/src/ImageSharp/Formats/Jpg/Components/DCT.cs
@ -0,0 +1,395 @@
+// <copyright file="DCT.cs" company="James Jackson-South">
+// Copyright (c) James Jackson-South and contributors.
+// Licensed under the Apache License, Version 2.0.
+// </copyright>
+// ReSharper disable InconsistentNaming
+namespace ImageSharp.Formats
+{
+    using System.Numerics;
+    using System.Runtime.CompilerServices;
+
+    /// <summary>
+    /// Contains forward & inverse DCT implementations
+    /// </summary>
+    internal static class DCT
+    {
+        /// <summary>
+        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization)
+        /// </summary>
+        /// <param name="src">Source</param>
+        /// <param name="dest">Destination</param>
+        /// <param name="temp">Temporary block provided by the caller</param>
+        public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
+        {
+            src.TransposeInto(ref temp);
+            IDCT8x4_LeftPart(ref temp, ref dest);
+            IDCT8x4_RightPart(ref temp, ref dest);
+
+            dest.TransposeInto(ref temp);
+
+            IDCT8x4_LeftPart(ref temp, ref dest);
+            IDCT8x4_RightPart(ref temp, ref dest);
+
+            dest.MultiplyAllInplace(C_0_125);
+        }
+
+#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
+        private static readonly Vector4 C_1_175876 = new Vector4(1.175876f);
+        private static readonly Vector4 C_1_961571 = new Vector4(-1.961571f);
+        private static readonly Vector4 C_0_390181 = new Vector4(-0.390181f);
+        private static readonly Vector4 C_0_899976 = new Vector4(-0.899976f);
+        private static readonly Vector4 C_2_562915 = new Vector4(-2.562915f);
+        private static readonly Vector4 C_0_298631 = new Vector4(0.298631f);
+        private static readonly Vector4 C_2_053120 = new Vector4(2.053120f);
+        private static readonly Vector4 C_3_072711 = new Vector4(3.072711f);
+        private static readonly Vector4 C_1_501321 = new Vector4(1.501321f);
+        private static readonly Vector4 C_0_541196 = new Vector4(0.541196f);
+        private static readonly Vector4 C_1_847759 = new Vector4(-1.847759f);
+        private static readonly Vector4 C_0_765367 = new Vector4(0.765367f);
+
+        private static readonly Vector4 C_0_125 = new Vector4(0.1250f);
+#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
+        private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f);
+
+        /// <summary>
+        /// Do IDCT internal operations on the left part of the block. Original source:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="d">Destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1L;
+            Vector4 my7 = s.V7L;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3L;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5L;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = ((mz0 + mz1) * C_1_175876);
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2L;
+            Vector4 my6 = s.V6L;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0L;
+            Vector4 my4 = s.V4L;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0L = my0 + mb0;
+            d.V7L = my0 - mb0;
+            d.V1L = my1 + mb1;
+            d.V6L = my1 - mb1;
+            d.V2L = my2 + mb2;
+            d.V5L = my2 - mb2;
+            d.V3L = my3 + mb3;
+            d.V4L = my3 - mb3;
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the right part of the block.
+        /// Original source:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1R;
+            Vector4 my7 = s.V7R;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3R;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5R;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2R;
+            Vector4 my6 = s.V6R;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0R;
+            Vector4 my4 = s.V4R;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0R = my0 + mb0;
+            d.V7R = my0 - mb0;
+            d.V1R = my1 + mb1;
+            d.V6R = my1 - mb1;
+            d.V2R = my2 + mb2;
+            d.V5R = my2 - mb2;
+            d.V3R = my3 + mb3;
+            d.V4R = my3 - mb3;
+        }
+
+
+        /// <summary>
+        /// Original:
+        /// <see>
+        ///     <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
+        /// </see>
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void FDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 c0 = s.V0L;
+            Vector4 c1 = s.V7L;
+            Vector4 t0 = (c0 + c1);
+            Vector4 t7 = (c0 - c1);
+
+            c1 = s.V6L;
+            c0 = s.V1L;
+            Vector4 t1 = (c0 + c1);
+            Vector4 t6 = (c0 - c1);
+
+            c1 = s.V5L;
+            c0 = s.V2L;
+            Vector4 t2 = (c0 + c1);
+            Vector4 t5 = (c0 - c1);
+
+            c0 = s.V3L;
+            c1 = s.V4L;
+            Vector4 t3 = (c0 + c1);
+            Vector4 t4 = (c0 - c1);
+
+            /*
+            c1 = x[0]; c2 = x[7]; t0 = c1 + c2; t7 = c1 - c2;
+            c1 = x[1]; c2 = x[6]; t1 = c1 + c2; t6 = c1 - c2;
+            c1 = x[2]; c2 = x[5]; t2 = c1 + c2; t5 = c1 - c2;
+            c1 = x[3]; c2 = x[4]; t3 = c1 + c2; t4 = c1 - c2;
+            */
+
+            c0 = (t0 + t3);
+            Vector4 c3 = (t0 - t3);
+            c1 = (t1 + t2);
+            Vector4 c2 = (t1 - t2);
+
+            /*
+	        c0 = t0 + t3; c3 = t0 - t3;
+	        c1 = t1 + t2; c2 = t1 - t2;
+	        */
+
+            d.V0L = c0 + c1;
+            d.V4L = c0 - c1;
+
+            /*y[0] = c0 + c1;
+            y[4] = c0 - c1;*/
+
+            Vector4 w0 = new Vector4(0.541196f);
+            Vector4 w1 = new Vector4(1.306563f);
+
+
+            d.V2L = (w0 * c2) + (w1 * c3);
+
+            d.V6L = (w0 * c3) - (w1 * c2);
+            /*
+            y[2] = c2 * r[6] + c3 * r[2];
+            y[6] = c3 * r[6] - c2 * r[2];
+            */
+
+            w0 = new Vector4(1.175876f);
+            w1 = new Vector4(0.785695f);
+            c3 = ((w0 * t4) + (w1 * t7));
+            c0 = ((w0 * t7) - (w1 * t4));
+            /*
+            c3 = t4 * r[3] + t7 * r[5];
+            c0 = t7 * r[3] - t4 * r[5];
+            */
+
+            w0 = new Vector4(1.387040f);
+            w1 = new Vector4(0.275899f);
+            c2 = ((w0 * t5) + (w1 * t6));
+            c1 = ((w0 * t6) - (w1 * t5));
+            /*
+	        c2 = t5 * r[1] + t6 * r[7];
+	        c1 = t6 * r[1] - t5 * r[7];
+	        */
+
+            d.V3L = (c0 - c2);
+
+            d.V5L = (c3 - c1);
+            //y[5] = c3 - c1; y[3] = c0 - c2;
+
+            Vector4 invsqrt2 = new Vector4(0.707107f);
+            c0 = ((c0 + c2) * invsqrt2);
+            c3 = ((c3 + c1) * invsqrt2);
+            //c0 = (c0 + c2) * invsqrt2;
+            //c3 = (c3 + c1) * invsqrt2;
+
+            d.V1L = (c0 + c3);
+
+            d.V7L = (c0 - c3);
+            //y[1] = c0 + c3; y[7] = c0 - c3;
+
+            /*for(i = 0;i < 8;i++)
+            { 
+            y[i] *= invsqrt2h; 
+            }*/
+        }
+
+        /// <summary>
+        /// Original:
+        /// <see>
+        ///     <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
+        /// </see>
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void FDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 c0 = s.V0R;
+            Vector4 c1 = s.V7R;
+            Vector4 t0 = (c0 + c1);
+            Vector4 t7 = (c0 - c1);
+
+            c1 = s.V6R;
+            c0 = s.V1R;
+            Vector4 t1 = (c0 + c1);
+            Vector4 t6 = (c0 - c1);
+
+            c1 = s.V5R;
+            c0 = s.V2R;
+            Vector4 t2 = (c0 + c1);
+            Vector4 t5 = (c0 - c1);
+
+            c0 = s.V3R;
+            c1 = s.V4R;
+            Vector4 t3 = (c0 + c1);
+            Vector4 t4 = (c0 - c1);
+
+            /*
+            c1 = x[0]; c2 = x[7]; t0 = c1 + c2; t7 = c1 - c2;
+            c1 = x[1]; c2 = x[6]; t1 = c1 + c2; t6 = c1 - c2;
+            c1 = x[2]; c2 = x[5]; t2 = c1 + c2; t5 = c1 - c2;
+            c1 = x[3]; c2 = x[4]; t3 = c1 + c2; t4 = c1 - c2;
+            */
+
+            c0 = (t0 + t3);
+            Vector4 c3 = (t0 - t3);
+            c1 = (t1 + t2);
+            Vector4 c2 = (t1 - t2);
+
+            /*
+              c0 = t0 + t3; c3 = t0 - t3;
+              c1 = t1 + t2; c2 = t1 - t2;
+              */
+
+            d.V0R = c0 + c1;
+            d.V4R = c0 - c1;
+
+            /*y[0] = c0 + c1;
+            y[4] = c0 - c1;*/
+
+            Vector4 w0 = new Vector4(0.541196f);
+            Vector4 w1 = new Vector4(1.306563f);
+
+
+            d.V2R = (w0 * c2) + (w1 * c3);
+
+            d.V6R = (w0 * c3) - (w1 * c2);
+            /*
+            y[2] = c2 * r[6] + c3 * r[2];
+            y[6] = c3 * r[6] - c2 * r[2];
+            */
+
+            w0 = new Vector4(1.175876f);
+            w1 = new Vector4(0.785695f);
+            c3 = ((w0 * t4) + (w1 * t7));
+            c0 = ((w0 * t7) - (w1 * t4));
+            /*
+            c3 = t4 * r[3] + t7 * r[5];
+            c0 = t7 * r[3] - t4 * r[5];
+            */
+
+            w0 = new Vector4(1.387040f);
+            w1 = new Vector4(0.275899f);
+            c2 = ((w0 * t5) + (w1 * t6));
+            c1 = ((w0 * t6) - (w1 * t5));
+            /*
+              c2 = t5 * r[1] + t6 * r[7];
+              c1 = t6 * r[1] - t5 * r[7];
+              */
+
+            d.V3R = (c0 - c2);
+
+            d.V5R = (c3 - c1);
+            //y[5] = c3 - c1; y[3] = c0 - c2;
+
+            c0 = ((c0 + c2) * InvSqrt2);
+            c3 = ((c3 + c1) * InvSqrt2);
+            //c0 = (c0 + c2) * invsqrt2;
+            //c3 = (c3 + c1) * invsqrt2;
+
+            d.V1R = (c0 + c3);
+
+            d.V7R = (c0 - c3);
+            //y[1] = c0 + c3; y[7] = c0 - c3;
+
+            /*for(i = 0;i < 8;i++)
+            { 
+            y[i] *= invsqrt2h; 
+            }*/
+        }
+
+        public static void TransformFDCT(ref Block8x8F s, ref Block8x8F d, ref Block8x8F temp, bool offsetSourceByNeg128 = true)
+        {
+            s.TransposeInto(ref temp);
+            if (offsetSourceByNeg128)
+            {
+                temp.AddToAllInplace(new Vector4(-128));
+            }
+
+            FDCT8x4_LeftPart(ref temp, ref d);
+            FDCT8x4_RightPart(ref temp, ref d);
+
+            d.TransposeInto(ref temp);
+            
+            FDCT8x4_LeftPart(ref temp, ref d);
+            FDCT8x4_RightPart(ref temp, ref d);
+            
+            d.MultiplyAllInplace(C_0_125);
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpg/Components/MutableSpanExtensions.cs
+++ b/src/ImageSharp/Formats/Jpg/Components/MutableSpanExtensions.cs
@ -3,7 +3,7 @@
 // Licensed under the Apache License, Version 2.0.
 // </copyright>

-namespace ImageSharp.Formats.Jpg.Components
+namespace ImageSharp.Formats
 {
    using System.Numerics;
    using System.Runtime.CompilerServices;
@ -77,5 +77,58 @@ namespace ImageSharp.Formats.Jpg.Components
            data[2] = (int)v.Z;
            data[3] = (int)v.W;
        }
+
+
+
+        public static MutableSpan<float> ConvertToFloat32MutableSpan(this MutableSpan<int> src)
+        {
+            MutableSpan<float> result = new MutableSpan<float>(src.TotalCount);
+            for (int i = 0; i < src.TotalCount; i++)
+            {
+                result[i] = (float)src[i];
+            }
+            return result;
+        }
+
+        public static MutableSpan<int> ConvertToInt32MutableSpan(this MutableSpan<float> src)
+        {
+            MutableSpan<int> result = new MutableSpan<int>(src.TotalCount);
+            for (int i = 0; i < src.TotalCount; i++)
+            {
+                result[i] = (int)src[i];
+            }
+            return result;
+        }
+
+        public static MutableSpan<float> AddScalarToAllValues(this MutableSpan<float> src, float scalar)
+        {
+            MutableSpan<float> result = new MutableSpan<float>(src.TotalCount);
+            for (int i = 0; i < src.TotalCount; i++)
+            {
+                result[i] = src[i] + scalar;
+            }
+            return result;
+        }
+
+        public static MutableSpan<int> AddScalarToAllValues(this MutableSpan<int> src, int scalar)
+        {
+            MutableSpan<int> result = new MutableSpan<int>(src.TotalCount);
+            for (int i = 0; i < src.TotalCount; i++)
+            {
+                result[i] = src[i] + scalar;
+            }
+            return result;
+        }
+
+
+        public static MutableSpan<T> Copy<T>(this MutableSpan<T> src)
+        {
+            MutableSpan<T> result = new MutableSpan<T>(src.TotalCount);
+            for (int i = 0; i < src.TotalCount; i++)
+            {
+                result[i] = src[i];
+            }
+            return result;
+        }
    }
 }
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@ -1,20 +1,18 @@
 // Uncomment this to turn unit tests into benchmarks:
 //#define BENCHMARKING

-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-using System.Text;
-using ImageSharp.Formats;
-using Xunit;
-using Xunit.Abstractions;
 // ReSharper disable InconsistentNaming

 namespace ImageSharp.Tests.Formats.Jpg
 {
+    using System.Diagnostics;
+    using System.Numerics;
+
+    using ImageSharp.Formats;
+
+    using Xunit;
+    using Xunit.Abstractions;
+
    public class Block8x8FTests : UtilityTestClassBase
    {
 #if BENCHMARKING
@ -23,7 +21,8 @@ namespace ImageSharp.Tests.Formats.Jpg
        public const int Times = 1;
 #endif

-        public Block8x8FTests(ITestOutputHelper output) : base(output)
+        public Block8x8FTests(ITestOutputHelper output)
+            : base(output)
        {
        }

@ -31,20 +30,23 @@ namespace ImageSharp.Tests.Formats.Jpg
        public void Indexer()
        {
            float sum = 0;
-            this.Measure(Times, () =>
-            {
-                Block8x8F block = new Block8x8F();
-
-                for (int i = 0; i < Block8x8F.ScalarCount; i++)
-                {
-                    block[i] = i;
-                }
-                sum = 0;
-                for (int i = 0; i < Block8x8F.ScalarCount; i++)
-                {
-                    sum += block[i];
-                }
-                });
+            this.Measure(
+                Times,
+                () =>
+                    {
+                        Block8x8F block = new Block8x8F();
+
+                        for (int i = 0; i < Block8x8F.ScalarCount; i++)
+                        {
+                            block[i] = i;
+                        }
+
+                        sum = 0;
+                        for (int i = 0; i < Block8x8F.ScalarCount; i++)
+                        {
+                            sum += block[i];
+                        }
+                    });
            Assert.Equal(sum, 64f * 63f * 0.5f);
        }

@ -52,21 +54,24 @@ namespace ImageSharp.Tests.Formats.Jpg
        public unsafe void Indexer_GetScalarAt_SetScalarAt()
        {
            float sum = 0;
-            this.Measure(Times, () =>
-            {
-                Block8x8F block = new Block8x8F();
-
-                for (int i = 0; i < Block8x8F.ScalarCount; i++)
-                {
-                    Block8x8F.SetScalarAt(&block, i, i);
-                }
-                sum = 0;
-                for (int i = 0; i < Block8x8F.ScalarCount; i++)
-                {
-                    sum += Block8x8F.GetScalarAt(&block, i);
-                }
-            });
-            Assert.Equal(sum, 64f*63f*0.5f);
+            this.Measure(
+                Times,
+                () =>
+                    {
+                        Block8x8F block = new Block8x8F();
+
+                        for (int i = 0; i < Block8x8F.ScalarCount; i++)
+                        {
+                            Block8x8F.SetScalarAt(&block, i, i);
+                        }
+
+                        sum = 0;
+                        for (int i = 0; i < Block8x8F.ScalarCount; i++)
+                        {
+                            sum += Block8x8F.GetScalarAt(&block, i);
+                        }
+                    });
+            Assert.Equal(sum, 64f * 63f * 0.5f);
        }

        [Fact]
@ -74,21 +79,24 @@ namespace ImageSharp.Tests.Formats.Jpg
        {
            float sum = 0;

-            this.Measure(Times, () =>
-            {
-                //Block8x8F block = new Block8x8F();
-                float[] block = new float[64];
-                for (int i = 0; i < Block8x8F.ScalarCount; i++)
-                {
-                    block[i] = i;
-                }
-                sum = 0;
-                for (int i = 0; i < Block8x8F.ScalarCount; i++)
-                {
-                    sum += block[i];
-                }
-            });
-            Assert.Equal(sum, 64f*63f*0.5f);
+            this.Measure(
+                Times,
+                () =>
+                    {
+                        // Block8x8F block = new Block8x8F();
+                        float[] block = new float[64];
+                        for (int i = 0; i < Block8x8F.ScalarCount; i++)
+                        {
+                            block[i] = i;
+                        }
+
+                        sum = 0;
+                        for (int i = 0; i < Block8x8F.ScalarCount; i++)
+                        {
+                            sum += block[i];
+                        }
+                    });
+            Assert.Equal(sum, 64f * 63f * 0.5f);
        }

        [Fact]
@ -101,15 +109,19 @@ namespace ImageSharp.Tests.Formats.Jpg
            {
                data[i] = i;
            }
-            Measure(Times, () =>
-            {
-                Block8x8F b = new Block8x8F();
-                b.LoadFrom(data);
-                b.CopyTo(mirror);
-            });
+
+            this.Measure(
+                Times,
+                () =>
+                    {
+                        Block8x8F b = new Block8x8F();
+                        b.LoadFrom(data);
+                        b.CopyTo(mirror);
+                    });

            Assert.Equal(data, mirror);
-            //PrintLinearData((MutableSpan<float>)mirror);
+
+            // PrintLinearData((MutableSpan<float>)mirror);
        }

        [Fact]
@ -122,15 +134,19 @@ namespace ImageSharp.Tests.Formats.Jpg
            {
                data[i] = i;
            }
-            Measure(Times, () =>
-            {
-                Block8x8F b = new Block8x8F();
-                Block8x8F.LoadFrom(&b, data);
-                Block8x8F.CopyTo(&b, mirror);
-            });
+
+            this.Measure(
+                Times,
+                () =>
+                    {
+                        Block8x8F b = new Block8x8F();
+                        Block8x8F.LoadFrom(&b, data);
+                        Block8x8F.CopyTo(&b, mirror);
+                    });

            Assert.Equal(data, mirror);
-            //PrintLinearData((MutableSpan<float>)mirror);
+
+            // PrintLinearData((MutableSpan<float>)mirror);
        }

        [Fact]
@ -143,19 +159,21 @@ namespace ImageSharp.Tests.Formats.Jpg
            {
                data[i] = i;
            }
-            Measure(Times, () =>
-            {
-                Block8x8F v = new Block8x8F();
-                v.LoadFrom(data);
-                v.CopyTo(mirror);
-            });

-            
+            this.Measure(
+                Times,
+                () =>
+                    {
+                        Block8x8F v = new Block8x8F();
+                        v.LoadFrom(data);
+                        v.CopyTo(mirror);
+                    });
+
            Assert.Equal(data, mirror);
-            //PrintLinearData((MutableSpan<int>)mirror);
+
+            // PrintLinearData((MutableSpan<int>)mirror);
        }
-        
-        
+
        [Fact]
        public void TransposeInto()
        {
@ -173,8 +191,6 @@ namespace ImageSharp.Tests.Formats.Jpg

            Assert.Equal(expected, actual);
        }
-        
-

        private class BufferHolder
        {
@ -188,7 +204,7 @@ namespace ImageSharp.Tests.Formats.Jpg
            source.Buffer.LoadFrom(Create8x8FloatData());
            BufferHolder dest = new BufferHolder();

-            Output.WriteLine($"TranposeInto_PinningImpl_Benchmark X {Times} ...");
+            this.Output.WriteLine($"TranposeInto_PinningImpl_Benchmark X {Times} ...");
            Stopwatch sw = Stopwatch.StartNew();

            for (int i = 0; i < Times; i++)
@ -197,10 +213,9 @@ namespace ImageSharp.Tests.Formats.Jpg
            }

            sw.Stop();
-            Output.WriteLine($"TranposeInto_PinningImpl_Benchmark finished in {sw.ElapsedMilliseconds} ms");
-
+            this.Output.WriteLine($"TranposeInto_PinningImpl_Benchmark finished in {sw.ElapsedMilliseconds} ms");
        }
-        
+
        [Fact]
        public void iDCT2D8x4_LeftPart()
        {
@ -214,14 +229,14 @@ namespace ImageSharp.Tests.Formats.Jpg

            Block8x8F dest = new Block8x8F();

-            source.IDCT8x4_LeftPart(ref dest);
+            DCT.IDCT8x4_LeftPart(ref source, ref dest);

            float[] actualDestArray = new float[64];
            dest.CopyTo(actualDestArray);

-            Print8x8Data(expectedDestArray);
-            Output.WriteLine("**************");
-            Print8x8Data(actualDestArray);
+            this.Print8x8Data(expectedDestArray);
+            this.Output.WriteLine("**************");
+            this.Print8x8Data(actualDestArray);

            Assert.Equal(expectedDestArray, actualDestArray);
        }
@ -239,65 +254,49 @@ namespace ImageSharp.Tests.Formats.Jpg

            Block8x8F dest = new Block8x8F();

-            source.IDCT8x4_RightPart(ref dest);
+            DCT.IDCT8x4_RightPart(ref source, ref dest);

            float[] actualDestArray = new float[64];
            dest.CopyTo(actualDestArray);

-            Print8x8Data(expectedDestArray);
-            Output.WriteLine("**************");
-            Print8x8Data(actualDestArray);
+            this.Print8x8Data(expectedDestArray);
+            this.Output.WriteLine("**************");
+            this.Print8x8Data(actualDestArray);

            Assert.Equal(expectedDestArray.Data, actualDestArray);
        }

-        private struct ApproximateFloatComparer : IEqualityComparer<float>
-        {
-            private const float Eps = 0.0001f;
-
-            public bool Equals(float x, float y)
-            {
-                float d = x - y;
-
-                return d > -Eps && d < Eps;
-            }
-
-            public int GetHashCode(float obj)
-            {
-                throw new InvalidOperationException();
-            }
-        }
-
-        [Fact]
-        public void IDCTInto()
+        [Theory]
+        [InlineData(1)]
+        [InlineData(2)]
+        [InlineData(3)]
+        public void TransformIDCT(int seed)
        {
-            float[] sourceArray = Create8x8FloatData();
+            var sourceArray = Create8x8RandomFloatData(-200, 200, seed);
            float[] expectedDestArray = new float[64];
            float[] tempArray = new float[64];

            ReferenceImplementations.iDCT2D_llm(sourceArray, expectedDestArray, tempArray);

-            //ReferenceImplementations.iDCT8x8_llm_sse(sourceArray, expectedDestArray, tempArray);
-
+            // ReferenceImplementations.iDCT8x8_llm_sse(sourceArray, expectedDestArray, tempArray);
            Block8x8F source = new Block8x8F();
            source.LoadFrom(sourceArray);

            Block8x8F dest = new Block8x8F();
            Block8x8F tempBuffer = new Block8x8F();

-            source.TransformIDCTInto(ref dest, ref tempBuffer);
+            DCT.TransformIDCT(ref source, ref dest, ref tempBuffer);

            float[] actualDestArray = new float[64];
            dest.CopyTo(actualDestArray);

-            Print8x8Data(expectedDestArray);
-            Output.WriteLine("**************");
-            Print8x8Data(actualDestArray);
-            Assert.Equal(expectedDestArray, actualDestArray, new ApproximateFloatComparer());
-            Assert.Equal(expectedDestArray, actualDestArray, new ApproximateFloatComparer());
+            this.Print8x8Data(expectedDestArray);
+            this.Output.WriteLine("**************");
+            this.Print8x8Data(actualDestArray);
+            Assert.Equal(expectedDestArray, actualDestArray, new ApproximateFloatComparer(1f));
+            Assert.Equal(expectedDestArray, actualDestArray, new ApproximateFloatComparer(1f));
        }

-
        [Fact]
        public unsafe void CopyColorsTo()
        {
@ -308,24 +307,23 @@ namespace ImageSharp.Tests.Formats.Jpg

            int stride = 256;
            int height = 42;
-            int offset = height*10 + 20;
+            int offset = height * 10 + 20;

-            byte[] colorsExpected = new byte[stride*height];
-            byte[] colorsActual = new byte[stride*height];
+            byte[] colorsExpected = new byte[stride * height];
+            byte[] colorsActual = new byte[stride * height];

            Block8x8F temp = new Block8x8F();
-            
+
            ReferenceImplementations.CopyColorsTo(ref block, new MutableSpan<byte>(colorsExpected, offset), stride);

            block.CopyColorsTo(new MutableSpan<byte>(colorsActual, offset), stride, &temp);

-            //Output.WriteLine("******* EXPECTED: *********");
-            //PrintLinearData(colorsExpected);
-            //Output.WriteLine("******** ACTUAL: **********");
-
+            // Output.WriteLine("******* EXPECTED: *********");
+            // PrintLinearData(colorsExpected);
+            // Output.WriteLine("******** ACTUAL: **********");
            Assert.Equal(colorsExpected, colorsActual);
        }
-        
+
        private static float[] Create8x8ColorCropTestData()
        {
            float[] result = new float[64];
@ -336,6 +334,7 @@ namespace ImageSharp.Tests.Formats.Jpg
                    result[i * 8 + j] = -300 + i * 100 + j * 10;
                }
            }
+
            return result;
        }

@ -345,22 +344,88 @@ namespace ImageSharp.Tests.Formats.Jpg
            Block8x8F block = new Block8x8F();
            var input = Create8x8ColorCropTestData();
            block.LoadFrom(input);
-            Output.WriteLine("Input:");
-            PrintLinearData(input);
-            
+            this.Output.WriteLine("Input:");
+            this.PrintLinearData(input);

            Block8x8F dest = new Block8x8F();
            block.TransformByteConvetibleColorValuesInto(ref dest);

            float[] array = new float[64];
            dest.CopyTo(array);
-            Output.WriteLine("Result:");
-            PrintLinearData(array);
+            this.Output.WriteLine("Result:");
+            this.PrintLinearData(array);
            foreach (float val in array)
            {
                Assert.InRange(val, 0, 255);
            }
        }

+        [Theory]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void FDCT8x4_LeftPart(int seed)
+        {
+            var src = Create8x8RandomFloatData(-200, 200, seed);
+            var srcBlock = new Block8x8F();
+            srcBlock.LoadFrom(src);
+
+            var destBlock = new Block8x8F();
+
+            var expectedDest = new MutableSpan<float>(64);
+
+            ReferenceImplementations.fDCT2D8x4_32f(src, expectedDest);
+            DCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock);
+
+            var actualDest = new MutableSpan<float>(64);
+            destBlock.CopyTo(actualDest);
+
+            Assert.Equal(actualDest.Data, expectedDest.Data, new ApproximateFloatComparer(1f));
+        }
+
+        [Theory]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void FDCT8x4_RightPart(int seed)
+        {
+            var src = Create8x8RandomFloatData(-200, 200, seed);
+            var srcBlock = new Block8x8F();
+            srcBlock.LoadFrom(src);
+
+            var destBlock = new Block8x8F();
+
+            var expectedDest = new MutableSpan<float>(64);
+
+            ReferenceImplementations.fDCT2D8x4_32f(src.Slice(4), expectedDest.Slice(4));
+            DCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock);
+
+            var actualDest = new MutableSpan<float>(64);
+            destBlock.CopyTo(actualDest);
+
+            Assert.Equal(actualDest.Data, expectedDest.Data, new ApproximateFloatComparer(1f));
+        }
+
+        [Theory]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void TransformFDCT(int seed)
+        {
+            var src = Create8x8RandomFloatData(-200, 200, seed);
+            var srcBlock = new Block8x8F();
+            srcBlock.LoadFrom(src);
+
+            var destBlock = new Block8x8F();
+
+            var expectedDest = new MutableSpan<float>(64);
+            var temp1 = new MutableSpan<float>(64);
+            var temp2 = new Block8x8F();
+
+            ReferenceImplementations.fDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
+            DCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false);
+
+            var actualDest = new MutableSpan<float>(64);
+            destBlock.CopyTo(actualDest);
+
+            Assert.Equal(actualDest.Data, expectedDest.Data, new ApproximateFloatComparer(1f));
+        }
    }
 }
--- a/tests/ImageSharp.Tests/Formats/Jpg/JpegTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/JpegTests.cs
@ -10,9 +10,6 @@ namespace ImageSharp.Tests
 {
    public class JpegTests
    {
-        
-        public const string TestOutputDirectory = "TestOutput/Jpeg";
-
        private ITestOutputHelper Output { get; }

        public JpegTests(ITestOutputHelper output)
--- a/tests/ImageSharp.Tests/Formats/Jpg/ReferenceImplementations.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/ReferenceImplementations.cs
@ -2,6 +2,7 @@

 namespace ImageSharp.Tests.Formats.Jpg
 {
+    using System;
    using System.Numerics;
    using System.Runtime.CompilerServices;

@ -9,7 +10,7 @@ namespace ImageSharp.Tests.Formats.Jpg

    /// <summary>
    /// This class contains simplified (unefficient) reference implementations to produce verification data for unit tests
-    /// DCT code Ported from https://github.com/norishigefukushima/dct_simd
+    /// Floating point DCT code Ported from https://github.com/norishigefukushima/dct_simd
    /// </summary>
    internal static class ReferenceImplementations
    {
@ -46,6 +47,310 @@ namespace ImageSharp.Tests.Formats.Jpg
            }
        }

+        public static class IntegerReferenceDCT
+        {
+            private const int fix_0_298631336 = 2446;
+            private const int fix_0_390180644 = 3196;
+            private const int fix_0_541196100 = 4433;
+            private const int fix_0_765366865 = 6270;
+            private const int fix_0_899976223 = 7373;
+            private const int fix_1_175875602 = 9633;
+            private const int fix_1_501321110 = 12299;
+            private const int fix_1_847759065 = 15137;
+            private const int fix_1_961570560 = 16069;
+            private const int fix_2_053119869 = 16819;
+            private const int fix_2_562915447 = 20995;
+            private const int fix_3_072711026 = 25172;
+
+            /// <summary>
+            /// The number of bits
+            /// </summary>
+            private const int Bits = 13;
+
+            /// <summary>
+            /// The number of bits to shift by on the first pass.
+            /// </summary>
+            private const int Pass1Bits = 2;
+
+            /// <summary>
+            /// The value to shift by
+            /// </summary>
+            private const int CenterJSample = 128;
+
+            /// <summary>
+            /// Performs a forward DCT on an 8x8 block of coefficients, including a level shift.
+            /// Leave results scaled up by an overall factor of 8.
+            /// </summary>
+            /// <param name="block">The block of coefficients.</param>
+            public static void TransformFDCTInplace(MutableSpan<int> block)
+            {
+                // Pass 1: process rows.
+                for (int y = 0; y < 8; y++)
+                {
+                    int y8 = y * 8;
+
+                    int x0 = block[y8];
+                    int x1 = block[y8 + 1];
+                    int x2 = block[y8 + 2];
+                    int x3 = block[y8 + 3];
+                    int x4 = block[y8 + 4];
+                    int x5 = block[y8 + 5];
+                    int x6 = block[y8 + 6];
+                    int x7 = block[y8 + 7];
+
+                    int tmp0 = x0 + x7;
+                    int tmp1 = x1 + x6;
+                    int tmp2 = x2 + x5;
+                    int tmp3 = x3 + x4;
+
+                    int tmp10 = tmp0 + tmp3;
+                    int tmp12 = tmp0 - tmp3;
+                    int tmp11 = tmp1 + tmp2;
+                    int tmp13 = tmp1 - tmp2;
+
+                    tmp0 = x0 - x7;
+                    tmp1 = x1 - x6;
+                    tmp2 = x2 - x5;
+                    tmp3 = x3 - x4;
+
+                    block[y8] = (tmp10 + tmp11 - (8 * CenterJSample)) << Pass1Bits;
+                    block[y8 + 4] = (tmp10 - tmp11) << Pass1Bits;
+                    int z1 = (tmp12 + tmp13) * fix_0_541196100;
+                    z1 += 1 << (Bits - Pass1Bits - 1);
+                    block[y8 + 2] = (z1 + (tmp12 * fix_0_765366865)) >> (Bits - Pass1Bits);
+                    block[y8 + 6] = (z1 - (tmp13 * fix_1_847759065)) >> (Bits - Pass1Bits);
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp0 + tmp2;
+                    tmp13 = tmp1 + tmp3;
+                    z1 = (tmp12 + tmp13) * fix_1_175875602;
+                    z1 += 1 << (Bits - Pass1Bits - 1);
+                    tmp0 = tmp0 * fix_1_501321110;
+                    tmp1 = tmp1 * fix_3_072711026;
+                    tmp2 = tmp2 * fix_2_053119869;
+                    tmp3 = tmp3 * fix_0_298631336;
+                    tmp10 = tmp10 * -fix_0_899976223;
+                    tmp11 = tmp11 * -fix_2_562915447;
+                    tmp12 = tmp12 * -fix_0_390180644;
+                    tmp13 = tmp13 * -fix_1_961570560;
+
+                    tmp12 += z1;
+                    tmp13 += z1;
+                    block[y8 + 1] = (tmp0 + tmp10 + tmp12) >> (Bits - Pass1Bits);
+                    block[y8 + 3] = (tmp1 + tmp11 + tmp13) >> (Bits - Pass1Bits);
+                    block[y8 + 5] = (tmp2 + tmp11 + tmp12) >> (Bits - Pass1Bits);
+                    block[y8 + 7] = (tmp3 + tmp10 + tmp13) >> (Bits - Pass1Bits);
+                }
+
+                // Pass 2: process columns.
+                // We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8.
+                for (int x = 0; x < 8; x++)
+                {
+                    int tmp0 = block[x] + block[56 + x];
+                    int tmp1 = block[8 + x] + block[48 + x];
+                    int tmp2 = block[16 + x] + block[40 + x];
+                    int tmp3 = block[24 + x] + block[32 + x];
+
+                    int tmp10 = tmp0 + tmp3 + (1 << (Pass1Bits - 1));
+                    int tmp12 = tmp0 - tmp3;
+                    int tmp11 = tmp1 + tmp2;
+                    int tmp13 = tmp1 - tmp2;
+
+                    tmp0 = block[x] - block[56 + x];
+                    tmp1 = block[8 + x] - block[48 + x];
+                    tmp2 = block[16 + x] - block[40 + x];
+                    tmp3 = block[24 + x] - block[32 + x];
+
+                    block[x] = (tmp10 + tmp11) >> Pass1Bits;
+                    block[32 + x] = (tmp10 - tmp11) >> Pass1Bits;
+
+                    int z1 = (tmp12 + tmp13) * fix_0_541196100;
+                    z1 += 1 << (Bits + Pass1Bits - 1);
+                    block[16 + x] = (z1 + (tmp12 * fix_0_765366865)) >> (Bits + Pass1Bits);
+                    block[48 + x] = (z1 - (tmp13 * fix_1_847759065)) >> (Bits + Pass1Bits);
+
+                    tmp10 = tmp0 + tmp3;
+                    tmp11 = tmp1 + tmp2;
+                    tmp12 = tmp0 + tmp2;
+                    tmp13 = tmp1 + tmp3;
+                    z1 = (tmp12 + tmp13) * fix_1_175875602;
+                    z1 += 1 << (Bits + Pass1Bits - 1);
+                    tmp0 = tmp0 * fix_1_501321110;
+                    tmp1 = tmp1 * fix_3_072711026;
+                    tmp2 = tmp2 * fix_2_053119869;
+                    tmp3 = tmp3 * fix_0_298631336;
+                    tmp10 = tmp10 * -fix_0_899976223;
+                    tmp11 = tmp11 * -fix_2_562915447;
+                    tmp12 = tmp12 * -fix_0_390180644;
+                    tmp13 = tmp13 * -fix_1_961570560;
+
+                    tmp12 += z1;
+                    tmp13 += z1;
+                    block[8 + x] = (tmp0 + tmp10 + tmp12) >> (Bits + Pass1Bits);
+                    block[24 + x] = (tmp1 + tmp11 + tmp13) >> (Bits + Pass1Bits);
+                    block[40 + x] = (tmp2 + tmp11 + tmp12) >> (Bits + Pass1Bits);
+                    block[56 + x] = (tmp3 + tmp10 + tmp13) >> (Bits + Pass1Bits);
+                }
+
+            }
+            private const int w1 = 2841; // 2048*sqrt(2)*cos(1*pi/16)
+            private const int w2 = 2676; // 2048*sqrt(2)*cos(2*pi/16)
+            private const int w3 = 2408; // 2048*sqrt(2)*cos(3*pi/16)
+            private const int w5 = 1609; // 2048*sqrt(2)*cos(5*pi/16)
+            private const int w6 = 1108; // 2048*sqrt(2)*cos(6*pi/16)
+            private const int w7 = 565;  // 2048*sqrt(2)*cos(7*pi/16)
+
+            private const int w1pw7 = w1 + w7;
+            private const int w1mw7 = w1 - w7;
+            private const int w2pw6 = w2 + w6;
+            private const int w2mw6 = w2 - w6;
+            private const int w3pw5 = w3 + w5;
+            private const int w3mw5 = w3 - w5;
+
+            private const int r2 = 181; // 256/sqrt(2)
+
+            /// <summary>
+            /// Performs a 2-D Inverse Discrete Cosine Transformation.
+            /// <para>
+            /// The input coefficients should already have been multiplied by the
+            /// appropriate quantization table. We use fixed-point computation, with the
+            /// number of bits for the fractional component varying over the intermediate
+            /// stages.
+            /// </para>
+            /// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
+            /// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
+            /// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
+            /// </summary>
+            /// <param name="src">The source block of coefficients</param>
+            public static void TransformIDCTInplace(MutableSpan<int> src)
+            {
+                // Horizontal 1-D IDCT.
+                for (int y = 0; y < 8; y++)
+                {
+                    int y8 = y * 8;
+
+                    // If all the AC components are zero, then the IDCT is trivial.
+                    if (src[y8 + 1] == 0 && src[y8 + 2] == 0 && src[y8 + 3] == 0 &&
+                        src[y8 + 4] == 0 && src[y8 + 5] == 0 && src[y8 + 6] == 0 && src[y8 + 7] == 0)
+                    {
+                        int dc = src[y8 + 0] << 3;
+                        src[y8 + 0] = dc;
+                        src[y8 + 1] = dc;
+                        src[y8 + 2] = dc;
+                        src[y8 + 3] = dc;
+                        src[y8 + 4] = dc;
+                        src[y8 + 5] = dc;
+                        src[y8 + 6] = dc;
+                        src[y8 + 7] = dc;
+                        continue;
+                    }
+
+                    // Prescale.
+                    int x0 = (src[y8 + 0] << 11) + 128;
+                    int x1 = src[y8 + 4] << 11;
+                    int x2 = src[y8 + 6];
+                    int x3 = src[y8 + 2];
+                    int x4 = src[y8 + 1];
+                    int x5 = src[y8 + 7];
+                    int x6 = src[y8 + 5];
+                    int x7 = src[y8 + 3];
+
+                    // Stage 1.
+                    int x8 = w7 * (x4 + x5);
+                    x4 = x8 + (w1mw7 * x4);
+                    x5 = x8 - (w1pw7 * x5);
+                    x8 = w3 * (x6 + x7);
+                    x6 = x8 - (w3mw5 * x6);
+                    x7 = x8 - (w3pw5 * x7);
+
+                    // Stage 2.
+                    x8 = x0 + x1;
+                    x0 -= x1;
+                    x1 = w6 * (x3 + x2);
+                    x2 = x1 - (w2pw6 * x2);
+                    x3 = x1 + (w2mw6 * x3);
+                    x1 = x4 + x6;
+                    x4 -= x6;
+                    x6 = x5 + x7;
+                    x5 -= x7;
+
+                    // Stage 3.
+                    x7 = x8 + x3;
+                    x8 -= x3;
+                    x3 = x0 + x2;
+                    x0 -= x2;
+                    x2 = ((r2 * (x4 + x5)) + 128) >> 8;
+                    x4 = ((r2 * (x4 - x5)) + 128) >> 8;
+
+                    // Stage 4.
+                    src[y8 + 0] = (x7 + x1) >> 8;
+                    src[y8 + 1] = (x3 + x2) >> 8;
+                    src[y8 + 2] = (x0 + x4) >> 8;
+                    src[y8 + 3] = (x8 + x6) >> 8;
+                    src[y8 + 4] = (x8 - x6) >> 8;
+                    src[y8 + 5] = (x0 - x4) >> 8;
+                    src[y8 + 6] = (x3 - x2) >> 8;
+                    src[y8 + 7] = (x7 - x1) >> 8;
+                }
+
+                // Vertical 1-D IDCT.
+                for (int x = 0; x < 8; x++)
+                {
+                    // Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
+                    // However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
+                    // we do not bother to check for the all-zero case.
+
+                    // Prescale.
+                    int y0 = (src[x] << 8) + 8192;
+                    int y1 = src[32 + x] << 8;
+                    int y2 = src[48 + x];
+                    int y3 = src[16 + x];
+                    int y4 = src[8 + x];
+                    int y5 = src[56 + x];
+                    int y6 = src[40 + x];
+                    int y7 = src[24 + x];
+
+                    // Stage 1.
+                    int y8 = (w7 * (y4 + y5)) + 4;
+                    y4 = (y8 + (w1mw7 * y4)) >> 3;
+                    y5 = (y8 - (w1pw7 * y5)) >> 3;
+                    y8 = (w3 * (y6 + y7)) + 4;
+                    y6 = (y8 - (w3mw5 * y6)) >> 3;
+                    y7 = (y8 - (w3pw5 * y7)) >> 3;
+
+                    // Stage 2.
+                    y8 = y0 + y1;
+                    y0 -= y1;
+                    y1 = (w6 * (y3 + y2)) + 4;
+                    y2 = (y1 - (w2pw6 * y2)) >> 3;
+                    y3 = (y1 + (w2mw6 * y3)) >> 3;
+                    y1 = y4 + y6;
+                    y4 -= y6;
+                    y6 = y5 + y7;
+                    y5 -= y7;
+
+                    // Stage 3.
+                    y7 = y8 + y3;
+                    y8 -= y3;
+                    y3 = y0 + y2;
+                    y0 -= y2;
+                    y2 = ((r2 * (y4 + y5)) + 128) >> 8;
+                    y4 = ((r2 * (y4 - y5)) + 128) >> 8;
+
+                    // Stage 4.
+                    src[x] = (y7 + y1) >> 14;
+                    src[8 + x] = (y3 + y2) >> 14;
+                    src[16 + x] = (y0 + y4) >> 14;
+                    src[24 + x] = (y8 + y6) >> 14;
+                    src[32 + x] = (y8 - y6) >> 14;
+                    src[40 + x] = (y0 - y4) >> 14;
+                    src[48 + x] = (y3 - y2) >> 14;
+                    src[56 + x] = (y7 - y1) >> 14;
+                }
+            }
+        }
+
        /// <summary>
        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L200
        /// </summary>
@ -56,9 +361,11 @@ namespace ImageSharp.Tests.Formats.Jpg
            float a0, a1, a2, a3, b0, b1, b2, b3;
            float z0, z1, z2, z3, z4;

+            float r0 = 1.414214f;
            float r1 = 1.387040f;
            float r2 = 1.306563f;
            float r3 = 1.175876f;
+            float r4 = 1.000000f;
            float r5 = 0.785695f;
            float r6 = 0.541196f;
            float r7 = 0.275899f;
@ -130,6 +437,145 @@ namespace ImageSharp.Tests.Formats.Jpg
            }
        }

+        /// <summary>
+        /// Original:
+        /// <see>
+        ///     <cref>https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15</cref>
+        /// </see>
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void fDCT2D8x4_32f(MutableSpan<float> s, MutableSpan<float> d)
+        {
+            Vector4 c0 = _mm_load_ps(s, 0);
+            Vector4 c1 = _mm_load_ps(s, 56);
+            Vector4 t0 = (c0 + c1);
+            Vector4 t7 = (c0 - c1);
+
+            c1 = _mm_load_ps(s, 48);
+            c0 = _mm_load_ps(s, 8);
+            Vector4 t1 = (c0 + c1);
+            Vector4 t6 = (c0 - c1);
+
+            c1 = _mm_load_ps(s, 40);
+            c0 = _mm_load_ps(s, 16);
+            Vector4 t2 = (c0 + c1);
+            Vector4 t5 = (c0 - c1);
+
+            c0 = _mm_load_ps(s, 24);
+            c1 = _mm_load_ps(s, 32);
+            Vector4 t3 = (c0 + c1);
+            Vector4 t4 = (c0 - c1);
+
+            /*
+            c1 = x[0]; c2 = x[7]; t0 = c1 + c2; t7 = c1 - c2;
+            c1 = x[1]; c2 = x[6]; t1 = c1 + c2; t6 = c1 - c2;
+            c1 = x[2]; c2 = x[5]; t2 = c1 + c2; t5 = c1 - c2;
+            c1 = x[3]; c2 = x[4]; t3 = c1 + c2; t4 = c1 - c2;
+            */
+
+            c0 = (t0 + t3);
+            Vector4 c3 = (t0 - t3);
+            c1 = (t1 + t2);
+            Vector4 c2 = (t1 - t2);
+
+            /*
+	        c0 = t0 + t3; c3 = t0 - t3;
+	        c1 = t1 + t2; c2 = t1 - t2;
+	        */
+
+            _mm_store_ps(d, 0, (c0 + c1));
+
+            _mm_store_ps(d, 32, (c0 - c1));
+
+            /*y[0] = c0 + c1;
+            y[4] = c0 - c1;*/
+
+            Vector4 w0 = new Vector4(0.541196f);
+            Vector4 w1 = new Vector4(1.306563f);
+
+            _mm_store_ps(d, 16, ((w0 * c2) + (w1 * c3)));
+
+            _mm_store_ps(d, 48, ((w0 * c3) - (w1 * c2)));
+            /*
+            y[2] = c2 * r[6] + c3 * r[2];
+            y[6] = c3 * r[6] - c2 * r[2];
+            */
+
+            w0 = new Vector4(1.175876f);
+            w1 = new Vector4(0.785695f);
+            c3 = ((w0 * t4) + (w1 * t7));
+            c0 = ((w0 * t7) - (w1 * t4));
+            /*
+            c3 = t4 * r[3] + t7 * r[5];
+            c0 = t7 * r[3] - t4 * r[5];
+            */
+
+            w0 = new Vector4(1.387040f);
+            w1 = new Vector4(0.275899f);
+            c2 = ((w0 * t5) + (w1 * t6));
+            c1 = ((w0 * t6) - (w1 * t5));
+            /*
+	        c2 = t5 * r[1] + t6 * r[7];
+	        c1 = t6 * r[1] - t5 * r[7];
+	        */
+
+            _mm_store_ps(d, 24, (c0 - c2));
+
+            _mm_store_ps(d, 40, (c3 - c1));
+            //y[5] = c3 - c1; y[3] = c0 - c2;
+
+            Vector4 invsqrt2 = new Vector4(0.707107f);
+            c0 = ((c0 + c2) * invsqrt2);
+            c3 = ((c3 + c1) * invsqrt2);
+            //c0 = (c0 + c2) * invsqrt2;
+            //c3 = (c3 + c1) * invsqrt2;
+
+            _mm_store_ps(d, 8, (c0 + c3));
+
+            _mm_store_ps(d, 56, (c0 - c3));
+            //y[1] = c0 + c3; y[7] = c0 - c3;
+
+            /*for(i = 0;i < 8;i++)
+            { 
+            y[i] *= invsqrt2h; 
+            }*/
+        }
+
+        public static void fDCT8x8_llm_sse(MutableSpan<float> s, MutableSpan<float> d, MutableSpan<float> temp)
+        {
+            Transpose8x8(s, temp);
+
+            fDCT2D8x4_32f(temp, d);
+
+            fDCT2D8x4_32f(temp.Slice(4), d.Slice(4));
+
+            Transpose8x8(d, temp);
+
+            fDCT2D8x4_32f(temp, d);
+
+            fDCT2D8x4_32f(temp.Slice(4), d.Slice(4));
+
+            Vector4 c = new Vector4(0.1250f);
+
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//0
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//1
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//2
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//3
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//4
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//5
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//6
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//7
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//8
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//9
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//10
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//11
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//12
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//13
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//14
+            _mm_store_ps(d, 0, (_mm_load_ps(d, 0) * c)); d.AddOffset(4);//15
+        }
+
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static Vector4 _mm_load_ps(MutableSpan<float> src, int offset)
        {
@ -329,5 +775,96 @@ namespace ImageSharp.Tests.Formats.Jpg
                }
            }
        }
+
+        internal static void fDCT1Dllm_32f(MutableSpan<float> x, MutableSpan<float> y)
+        {
+            float t0, t1, t2, t3, t4, t5, t6, t7;
+            float c0, c1, c2, c3;
+            float[] r = new float[8];
+
+            //for(i = 0;i < 8;i++){ r[i] = (float)(cos((double)i / 16.0 * M_PI) * M_SQRT2); }
+            r[0] = 1.414214f;
+            r[1] = 1.387040f;
+            r[2] = 1.306563f;
+            r[3] = 1.175876f;
+            r[4] = 1.000000f;
+            r[5] = 0.785695f;
+            r[6] = 0.541196f;
+            r[7] = 0.275899f;
+
+            const float invsqrt2 = 0.707107f; //(float)(1.0f / M_SQRT2);
+            const float invsqrt2h = 0.353554f; //invsqrt2*0.5f;
+
+            c1 = x[0];
+            c2 = x[7];
+            t0 = c1 + c2;
+            t7 = c1 - c2;
+            c1 = x[1];
+            c2 = x[6];
+            t1 = c1 + c2;
+            t6 = c1 - c2;
+            c1 = x[2];
+            c2 = x[5];
+            t2 = c1 + c2;
+            t5 = c1 - c2;
+            c1 = x[3];
+            c2 = x[4];
+            t3 = c1 + c2;
+            t4 = c1 - c2;
+
+            c0 = t0 + t3;
+            c3 = t0 - t3;
+            c1 = t1 + t2;
+            c2 = t1 - t2;
+
+            y[0] = c0 + c1;
+            y[4] = c0 - c1;
+            y[2] = c2 * r[6] + c3 * r[2];
+            y[6] = c3 * r[6] - c2 * r[2];
+
+            c3 = t4 * r[3] + t7 * r[5];
+            c0 = t7 * r[3] - t4 * r[5];
+            c2 = t5 * r[1] + t6 * r[7];
+            c1 = t6 * r[1] - t5 * r[7];
+
+            y[5] = c3 - c1;
+            y[3] = c0 - c2;
+            c0 = (c0 + c2) * invsqrt2;
+            c3 = (c3 + c1) * invsqrt2;
+            y[1] = c0 + c3;
+            y[7] = c0 - c3;
+        }
+
+        internal static void fDCT2D_llm(
+            MutableSpan<float> s,
+            MutableSpan<float> d,
+            MutableSpan<float> temp,
+            bool downscaleBy8 = false,
+            bool offsetSourceByNeg128 = false)
+        {
+            MutableSpan<float> sWorker = offsetSourceByNeg128 ? s.AddScalarToAllValues(-128f) : s;
+            
+            for (int j = 0; j < 8; j++)
+            {
+                fDCT1Dllm_32f(sWorker.Slice(j * 8), temp.Slice(j * 8));
+            }
+
+            Transpose8x8(temp, d);
+
+            for (int j = 0; j < 8; j++)
+            {
+                fDCT1Dllm_32f(d.Slice(j * 8), temp.Slice(j * 8));
+            }
+
+            Transpose8x8(temp, d);
+            
+            if (downscaleBy8)
+            {
+                for (int j = 0; j < 64; j++)
+                {
+                    d[j] *= 0.125f;
+                }
+            }
+        }
    }
 }
--- a/tests/ImageSharp.Tests/Formats/Jpg/ReferenceImplementationsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/ReferenceImplementationsTests.cs
@ -0,0 +1,144 @@
+// ReSharper disable InconsistentNaming
+namespace ImageSharp.Tests.Formats.Jpg
+{
+    using System.Numerics;
+    using ImageSharp.Formats;
+    using Xunit;
+    using Xunit.Abstractions;
+
+    public class ReferenceImplementationsTests : UtilityTestClassBase
+    {
+        public ReferenceImplementationsTests(ITestOutputHelper output)
+            : base(output)
+        {
+        }
+
+
+        [Theory]
+        [InlineData(42)]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void Idct_FloatingPointReferenceImplementation_IsEquivalentToIntegerImplementation(int seed)
+        {
+            MutableSpan<int> intData = Create8x8RandomIntData(-200, 200, seed);
+            MutableSpan<float> floatSrc = intData.ConvertToFloat32MutableSpan();
+
+            ReferenceImplementations.IntegerReferenceDCT.TransformIDCTInplace(intData);
+            
+            MutableSpan<float> dest = new MutableSpan<float>(64);
+            MutableSpan<float> temp = new MutableSpan<float>(64);
+
+            ReferenceImplementations.iDCT2D_llm(floatSrc, dest, temp);
+
+            for (int i = 0; i < 64; i++)
+            {
+                float expected = intData[i];
+                float actual = dest[i];
+
+                Assert.Equal(expected, actual, new ApproximateFloatComparer(1f));
+            }
+        }
+
+        [Theory]
+        [InlineData(42, 0)]
+        [InlineData(1, 0)]
+        [InlineData(2, 0)]
+        public void IntegerDCT_ForwardThenInverse(int seed, int startAt)
+        {
+            MutableSpan<int> original = Create8x8RandomIntData(-200, 200, seed);
+
+            var block = original.AddScalarToAllValues(128);
+
+            ReferenceImplementations.IntegerReferenceDCT.TransformFDCTInplace(block);
+            
+            for (int i = 0; i < 64; i++)
+            {
+                block[i] /= 8;
+            }
+
+            ReferenceImplementations.IntegerReferenceDCT.TransformIDCTInplace(block);
+            
+            for (int i = startAt; i < 64; i++)
+            {
+                float expected = original[i];
+                float actual = (float)block[i];
+
+                Assert.Equal(expected, actual, new ApproximateFloatComparer(3f));
+            }
+
+        }
+
+        [Theory]
+        [InlineData(42, 0)]
+        [InlineData(1, 0)]
+        [InlineData(2, 0)]
+        public void FloatingPointDCT_ReferenceImplementation_ForwardThenInverse(int seed, int startAt)
+        {
+            var data = Create8x8RandomIntData(-200, 200, seed);
+            MutableSpan<float> src = new MutableSpan<int>(data).ConvertToFloat32MutableSpan();
+            MutableSpan<float> dest = new MutableSpan<float>(64);
+            MutableSpan<float> temp = new MutableSpan<float>(64);
+
+            ReferenceImplementations.fDCT2D_llm(src, dest, temp, true);
+            ReferenceImplementations.iDCT2D_llm(dest, src, temp);
+
+            for (int i = startAt; i < 64; i++)
+            {
+                float expected = data[i];
+                float actual = (float)src[i];
+                
+                Assert.Equal(expected, actual, new ApproximateFloatComparer(2f));
+            }
+        }
+
+        [Fact]
+        public void HowMuchIsTheFish()
+        {
+            Output.WriteLine(Vector<int>.Count.ToString());
+        }
+
+        [Theory]
+        [InlineData(42)]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void Fdct_FloatingPointReferenceImplementation_IsEquivalentToIntegerImplementation(int seed)
+        {
+            MutableSpan<int> intData = Create8x8RandomIntData(-200, 200, seed);
+            MutableSpan<float> floatSrc = intData.ConvertToFloat32MutableSpan();
+            
+            ReferenceImplementations.IntegerReferenceDCT.TransformFDCTInplace(intData);
+            
+            MutableSpan<float> dest = new MutableSpan<float>(64);
+            MutableSpan<float> temp = new MutableSpan<float>(64);
+
+            ReferenceImplementations.fDCT2D_llm(floatSrc, dest, temp, offsetSourceByNeg128: true);
+
+            for (int i = 0; i < 64; i++)
+            {
+                float expected = intData[i];
+                float actual = dest[i];
+
+                Assert.Equal(expected, actual, new ApproximateFloatComparer(1f));
+            }
+        }
+
+        [Theory]
+        [InlineData(42)]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void Fdct_SimdReferenceImplementation_IsEquivalentToFloatingPointReferenceImplementation(int seed)
+        {
+            Block classic = new Block() { Data = Create8x8RandomIntData(-200, 200, seed) };
+            MutableSpan<float> src = new MutableSpan<int>(classic.Data).ConvertToFloat32MutableSpan();
+
+            MutableSpan<float> dest1 = new MutableSpan<float>(64);
+            MutableSpan<float> dest2 = new MutableSpan<float>(64);
+            MutableSpan<float> temp = new MutableSpan<float>(64);
+
+            ReferenceImplementations.fDCT2D_llm(src, dest1, temp, downscaleBy8: true, offsetSourceByNeg128: false);
+            ReferenceImplementations.fDCT8x8_llm_sse(src, dest2, temp);
+            
+            Assert.Equal(dest1.Data, dest2.Data, new ApproximateFloatComparer(1f));
+        }
+    }
+}
--- a/tests/ImageSharp.Tests/Formats/Jpg/UtilityTestClassBase.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/UtilityTestClassBase.cs
@ -1,21 +1,51 @@
-using System;
-using System.Diagnostics;
-using System.Runtime.CompilerServices;
-using System.Text;
+using System.Text;
 using ImageSharp.Formats;
 using Xunit.Abstractions;
+// ReSharper disable InconsistentNaming

 namespace ImageSharp.Tests.Formats.Jpg
 {
-    public class UtilityTestClassBase
+    using System;
+    using System.Collections.Generic;
+    using System.Diagnostics;
+    using System.Runtime.CompilerServices;
+    
+    /// <summary>
+    /// Utility class to measure the execution of an operation.
+    /// </summary>
+    public class MeasureFixture
    {
-        public UtilityTestClassBase(ITestOutputHelper output)
+        protected bool EnablePrinting = true;
+
+        protected void Measure(int times, Action action, [CallerMemberName] string operationName = null)
+        {
+            if (this.EnablePrinting) this.Output?.WriteLine($"{operationName} X {times} ...");
+            Stopwatch sw = Stopwatch.StartNew();
+
+            for (int i = 0; i < times; i++)
+            {
+                action();
+            }
+
+            sw.Stop();
+            if (this.EnablePrinting) this.Output?.WriteLine($"{operationName} finished in {sw.ElapsedMilliseconds} ms");
+        }
+
+        public MeasureFixture(ITestOutputHelper output)
        {
-            Output = output;
+            this.Output = output;
        }

        protected ITestOutputHelper Output { get; }
+    }
+

+    public class UtilityTestClassBase : MeasureFixture
+    {
+        public UtilityTestClassBase(ITestOutputHelper output) : base(output)
+        {
+        }
+        
        // ReSharper disable once InconsistentNaming
        public static float[] Create8x8FloatData()
        {
@ -29,10 +59,7 @@ namespace ImageSharp.Tests.Formats.Jpg
            }
            return result;
        }
-
-      
-
-
+        
        // ReSharper disable once InconsistentNaming
        public static int[] Create8x8IntData()
        {
@ -47,7 +74,25 @@ namespace ImageSharp.Tests.Formats.Jpg
            return result;
        }

-        internal void Print8x8Data<T>(MutableSpan<T> data) => Print8x8Data(data.Data);
+        // ReSharper disable once InconsistentNaming
+        public static int[] Create8x8RandomIntData(int minValue, int maxValue, int seed = 42)
+        {
+            Random rnd = new Random(seed);
+            int[] result = new int[64];
+            for (int i = 0; i < 8; i++)
+            {
+                for (int j = 0; j < 8; j++)
+                {
+                    result[i * 8 + j] = rnd.Next(minValue, maxValue);
+                }
+            }
+            return result;
+        }
+
+        internal static MutableSpan<float> Create8x8RandomFloatData(int minValue, int maxValue, int seed = 42)
+            => new MutableSpan<int>(Create8x8RandomIntData(minValue, maxValue, seed)).ConvertToFloat32MutableSpan();
+
+        internal void Print8x8Data<T>(MutableSpan<T> data) => this.Print8x8Data(data.Data);

        internal void Print8x8Data<T>(T[] data)
        {
@ -61,10 +106,10 @@ namespace ImageSharp.Tests.Formats.Jpg
                bld.AppendLine();
            }

-            Output.WriteLine(bld.ToString());
+            this.Output.WriteLine(bld.ToString());
        }

-        internal void PrintLinearData<T>(T[] data) => PrintLinearData(new MutableSpan<T>(data), data.Length);
+        internal void PrintLinearData<T>(T[] data) => this.PrintLinearData(new MutableSpan<T>(data), data.Length);

        internal void PrintLinearData<T>(MutableSpan<T> data, int count = -1)
        {
@ -75,21 +120,35 @@ namespace ImageSharp.Tests.Formats.Jpg
            {
                bld.Append($"{data[i],3} ");
            }
-            Output.WriteLine(bld.ToString());
+            this.Output.WriteLine(bld.ToString());
        }

-        protected void Measure(int times, Action action, [CallerMemberName] string operationName = null)
+        internal struct ApproximateFloatComparer : IEqualityComparer<float>
        {
-            Output.WriteLine($"{operationName} X {times} ...");
-            Stopwatch sw = Stopwatch.StartNew();
+            private readonly float Eps;

-            for (int i = 0; i < times; i++)
+            public ApproximateFloatComparer(float eps = 1f)
            {
-                action();
+                this.Eps = eps;
            }

-            sw.Stop();
-            Output.WriteLine($"{operationName} finished in {sw.ElapsedMilliseconds} ms");
+            public bool Equals(float x, float y)
+            {
+                float d = x - y;
+
+                return d > -this.Eps && d < this.Eps;
+            }
+
+            public int GetHashCode(float obj)
+            {
+                throw new InvalidOperationException();
+            }
+        }
+
+        protected void Print(string msg)
+        {
+            Debug.WriteLine(msg);
+            this.Output.WriteLine(msg);
        }
    }
 }
--- a/tests/ImageSharp.Tests46/Formats/Jpg/ReferenceImplementationsTests.cs
+++ b/tests/ImageSharp.Tests46/Formats/Jpg/ReferenceImplementationsTests.cs
@ -0,0 +1,144 @@
+// ReSharper disable InconsistentNaming
+namespace ImageSharp.Tests.Formats.Jpg
+{
+    using System.Numerics;
+    using ImageSharp.Formats;
+    using Xunit;
+    using Xunit.Abstractions;
+
+    public class ReferenceImplementationsTests : UtilityTestClassBase
+    {
+        public ReferenceImplementationsTests(ITestOutputHelper output)
+            : base(output)
+        {
+        }
+
+
+        [Theory]
+        [InlineData(42)]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void Idct_FloatingPointReferenceImplementation_IsEquivalentToIntegerImplementation(int seed)
+        {
+            MutableSpan<int> intData = Create8x8RandomIntData(-200, 200, seed);
+            MutableSpan<float> floatSrc = intData.ConvertToFloat32MutableSpan();
+
+            ReferenceImplementations.IntegerReferenceDCT.TransformIDCTInplace(intData);
+            
+            MutableSpan<float> dest = new MutableSpan<float>(64);
+            MutableSpan<float> temp = new MutableSpan<float>(64);
+
+            ReferenceImplementations.iDCT2D_llm(floatSrc, dest, temp);
+
+            for (int i = 0; i < 64; i++)
+            {
+                float expected = intData[i];
+                float actual = dest[i];
+
+                Assert.Equal(expected, actual, new ApproximateFloatComparer(1f));
+            }
+        }
+
+        [Theory]
+        [InlineData(42, 0)]
+        [InlineData(1, 0)]
+        [InlineData(2, 0)]
+        public void IntegerDCT_ForwardThenInverse(int seed, int startAt)
+        {
+            MutableSpan<int> original = Create8x8RandomIntData(-200, 200, seed);
+
+            var block = original.AddScalarToAllValues(128);
+
+            ReferenceImplementations.IntegerReferenceDCT.TransformFDCTInplace(block);
+            
+            for (int i = 0; i < 64; i++)
+            {
+                block[i] /= 8;
+            }
+
+            ReferenceImplementations.IntegerReferenceDCT.TransformIDCTInplace(block);
+            
+            for (int i = startAt; i < 64; i++)
+            {
+                float expected = original[i];
+                float actual = (float)block[i];
+
+                Assert.Equal(expected, actual, new ApproximateFloatComparer(3f));
+            }
+
+        }
+
+        [Theory]
+        [InlineData(42, 0)]
+        [InlineData(1, 0)]
+        [InlineData(2, 0)]
+        public void FloatingPointDCT_ReferenceImplementation_ForwardThenInverse(int seed, int startAt)
+        {
+            var data = Create8x8RandomIntData(-200, 200, seed);
+            MutableSpan<float> src = new MutableSpan<int>(data).ConvertToFloat32MutableSpan();
+            MutableSpan<float> dest = new MutableSpan<float>(64);
+            MutableSpan<float> temp = new MutableSpan<float>(64);
+
+            ReferenceImplementations.fDCT2D_llm(src, dest, temp, true);
+            ReferenceImplementations.iDCT2D_llm(dest, src, temp);
+
+            for (int i = startAt; i < 64; i++)
+            {
+                float expected = data[i];
+                float actual = (float)src[i];
+                
+                Assert.Equal(expected, actual, new ApproximateFloatComparer(2f));
+            }
+        }
+
+        [Fact]
+        public void HowMuchIsTheFish()
+        {
+            Output.WriteLine(Vector<int>.Count.ToString());
+        }
+
+        [Theory]
+        [InlineData(42)]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void Fdct_FloatingPointReferenceImplementation_IsEquivalentToIntegerImplementation(int seed)
+        {
+            MutableSpan<int> intData = Create8x8RandomIntData(-200, 200, seed);
+            MutableSpan<float> floatSrc = intData.ConvertToFloat32MutableSpan();
+            
+            ReferenceImplementations.IntegerReferenceDCT.TransformFDCTInplace(intData);
+            
+            MutableSpan<float> dest = new MutableSpan<float>(64);
+            MutableSpan<float> temp = new MutableSpan<float>(64);
+
+            ReferenceImplementations.fDCT2D_llm(floatSrc, dest, temp, offsetSourceByNeg128: true);
+
+            for (int i = 0; i < 64; i++)
+            {
+                float expected = intData[i];
+                float actual = dest[i];
+
+                Assert.Equal(expected, actual, new ApproximateFloatComparer(1f));
+            }
+        }
+
+        [Theory]
+        [InlineData(42)]
+        [InlineData(1)]
+        [InlineData(2)]
+        public void Fdct_SimdReferenceImplementation_IsEquivalentToFloatingPointReferenceImplementation(int seed)
+        {
+            Block classic = new Block() { Data = Create8x8RandomIntData(-200, 200, seed) };
+            MutableSpan<float> src = new MutableSpan<int>(classic.Data).ConvertToFloat32MutableSpan();
+
+            MutableSpan<float> dest1 = new MutableSpan<float>(64);
+            MutableSpan<float> dest2 = new MutableSpan<float>(64);
+            MutableSpan<float> temp = new MutableSpan<float>(64);
+
+            ReferenceImplementations.fDCT2D_llm(src, dest1, temp, downscaleBy8: true, offsetSourceByNeg128: false);
+            ReferenceImplementations.fDCT8x8_llm_sse(src, dest2, temp);
+            
+            Assert.Equal(dest1.Data, dest2.Data, new ApproximateFloatComparer(1f));
+        }
+    }
+}
--- a/tests/ImageSharp.Tests46/ImageSharp.Tests46.csproj
+++ b/tests/ImageSharp.Tests46/ImageSharp.Tests46.csproj
@ -157,6 +157,7 @@
    <Compile Include="..\ImageSharp.Tests\TestUtilities\TestUtilityExtensions.cs">
      <Link>TestUtilities\TestUtilityExtensions.cs</Link>
    </Compile>
+    <Compile Include="Formats\Jpg\ReferenceImplementationsTests.cs" />
    <Compile Include="HelloTest.cs" />
    <Compile Include="Properties\AssemblyInfo.cs" />
    <Compile Include="TestFile.cs" />