From 656c45436490c62970f547906b1dfab22c777785 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Sun, 9 Jul 2017 17:33:26 +1000
Subject: [PATCH] Reduce allocations

---
 .../Formats/Jpeg/Port/Components/IDCT.cs      | 477 +++++++++---------
 .../Formats/Jpeg/Port/JpegDecoderCore.cs      |  20 +-
 2 files changed, 245 insertions(+), 252 deletions(-)
diff --git a/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs b/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs
index 65c2bbde60..064b3bea36 100644
--- a/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs
@@ -6,24 +6,38 @@
     using ImageSharp.Memory;
 
     /// <summary>
-    /// Performa the invers
+    /// Performs the inverse Discrete Cosine Transform on each frame component.
     /// </summary>
     internal static class IDCT
     {
-        private const int DctCos1 = 4017;   // cos(pi/16)
-        private const int DctSin1 = 799;   // sin(pi/16)
-        private const int DctCos3 = 3406;   // cos(3*pi/16)
-        private const int DctSin3 = 2276;   // sin(3*pi/16)
-        private const int DctCos6 = 1567;   // cos(6*pi/16)
-        private const int DctSin6 = 3784;   // sin(6*pi/16)
-        private const int DctSqrt2 = 5793;   // sqrt(2)
+        /// <summary>
+        /// Precomputed values scaled up by 14 bits
+        /// </summary>
+        public static readonly short[] Aanscales =
+        {
+            16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, 22725, 31521, 29692, 26722, 22725, 17855,
+            12299, 6270, 21407, 29692, 27969, 25172, 21407, 16819, 11585,
+            5906, 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
+            16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, 12873,
+            17855, 16819, 15137, 12873, 10114, 6967, 3552, 8867, 12299,
+            11585, 10426, 8867, 6967, 4799, 2446, 4520, 6270, 5906, 5315,
+            4520, 3552, 2446, 1247
+        };
+
+        private const int DctCos1 = 4017;     // cos(pi/16)
+        private const int DctSin1 = 799;      // sin(pi/16)
+        private const int DctCos3 = 3406;     // cos(3*pi/16)
+        private const int DctSin3 = 2276;     // sin(3*pi/16)
+        private const int DctCos6 = 1567;     // cos(6*pi/16)
+        private const int DctSin6 = 3784;     // sin(6*pi/16)
+        private const int DctSqrt2 = 5793;    // sqrt(2)
         private const int DctSqrt1D2 = 2896;  // sqrt(2) / 2
 
 #pragma warning disable SA1310 // Field names must not contain underscore
-        private const int FIX_1_082392200 = 277;        /* FIX(1.082392200) */
-        private const int FIX_1_414213562 = 362;        /* FIX(1.414213562) */
-        private const int FIX_1_847759065 = 473;        /* FIX(1.847759065) */
-        private const int FIX_2_613125930 = 669;        /* FIX(2.613125930) */
+        private const int FIX_1_082392200 = 277;        // FIX(1.082392200)
+        private const int FIX_1_414213562 = 362;        // FIX(1.414213562)
+        private const int FIX_1_847759065 = 473;        // FIX(1.847759065)
+        private const int FIX_2_613125930 = 669;        // FIX(2.613125930)
 #pragma warning restore SA1310 // Field names must not contain underscore
 
         private const int ConstBits = 8;
@@ -42,21 +56,9 @@
         // be quite far out of range if the input data is corrupt, so a bulletproof
         // range-limiting step is required.  We use a mask-and-table-lookup method
         // to do the combined operations quickly, assuming that MaxJSample+1
-        // is a power of 2.  See the comments with prepare_range_limit_table for more info.
+        // is a power of 2.
         private const int RangeMask = (MaxJSample * 4) + 3; // 2 bits wider than legal samples
 
-        // Precomputed values scaled up by 14 bits
-        private static readonly short[] Aanscales =
-        {
-            16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, 22725, 31521, 29692, 26722, 22725, 17855,
-            12299, 6270, 21407, 29692, 27969, 25172, 21407, 16819, 11585,
-            5906, 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
-            16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, 12873,
-            17855, 16819, 15137, 12873, 10114, 6967, 3552, 8867, 12299,
-            11585, 10426, 8867, 6967, 4799, 2446, 4520, 6270, 5906, 5315,
-            4520, 3552, 2446, 1247
-        };
-
         private static readonly byte[] Limit = new byte[5 * (MaxJSample + 1)];
 
         static IDCT()
@@ -81,15 +83,13 @@
         /// 'Practical Fast 1-D DCT Algorithms with 11 Multiplications',
         /// IEEE Intl. Conf. on Acoustics, Speech &amp; Signal Processing, 1989, 988-991.
         /// </summary>
-        /// <param name="quantizationTables">The quantization tables</param>
         /// <param name="component">The fram component</param>
         /// <param name="blockBufferOffset">The block buffer offset</param>
         /// <param name="computationBuffer">The computational buffer for holding temp values</param>
-        public static void QuantizeAndInverse(QuantizationTables quantizationTables, ref FrameComponent component, int blockBufferOffset, Buffer<short> computationBuffer)
+        /// <param name="quantizationTable">The quantization table</param>
+        public static void QuantizeAndInverse(ref FrameComponent component, int blockBufferOffset, ref Span<short> computationBuffer, ref Span<short> quantizationTable)
         {
-            Span<short> qt = quantizationTables.Tables.GetRowSpan(component.QuantizationIdentifier);
             Span<short> blockData = component.BlockData.Slice(blockBufferOffset);
-            Span<short> computationBufferSpan = computationBuffer;
             int v0, v1, v2, v3, v4, v5, v6, v7;
             int p0, p1, p2, p3, p4, p5, p6, p7;
             int t;
@@ -108,32 +108,32 @@
                 p7 = blockData[row + 7];
 
                 // dequant p0
-                p0 *= qt[row];
+                p0 *= quantizationTable[row];
 
                 // check for all-zero AC coefficients
                 if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0)
                 {
                     t = ((DctSqrt2 * p0) + 512) >> 10;
                     short st = (short)t;
-                    computationBufferSpan[row] = st;
-                    computationBufferSpan[row + 1] = st;
-                    computationBufferSpan[row + 2] = st;
-                    computationBufferSpan[row + 3] = st;
-                    computationBufferSpan[row + 4] = st;
-                    computationBufferSpan[row + 5] = st;
-                    computationBufferSpan[row + 6] = st;
-                    computationBufferSpan[row + 7] = st;
+                    computationBuffer[row] = st;
+                    computationBuffer[row + 1] = st;
+                    computationBuffer[row + 2] = st;
+                    computationBuffer[row + 3] = st;
+                    computationBuffer[row + 4] = st;
+                    computationBuffer[row + 5] = st;
+                    computationBuffer[row + 6] = st;
+                    computationBuffer[row + 7] = st;
                     continue;
                 }
 
                 // dequant p1 ... p7
-                p1 *= qt[row + 1];
-                p2 *= qt[row + 2];
-                p3 *= qt[row + 3];
-                p4 *= qt[row + 4];
-                p5 *= qt[row + 5];
-                p6 *= qt[row + 6];
-                p7 *= qt[row + 7];
+                p1 *= quantizationTable[row + 1];
+                p2 *= quantizationTable[row + 2];
+                p3 *= quantizationTable[row + 3];
+                p4 *= quantizationTable[row + 4];
+                p5 *= quantizationTable[row + 5];
+                p6 *= quantizationTable[row + 6];
+                p7 *= quantizationTable[row + 7];
 
                 // stage 4
                 v0 = ((DctSqrt2 * p0) + 128) >> 8;
@@ -169,27 +169,27 @@
                 v6 = t;
 
                 // stage 1
-                computationBufferSpan[row] = (short)(v0 + v7);
-                computationBufferSpan[row + 7] = (short)(v0 - v7);
-                computationBufferSpan[row + 1] = (short)(v1 + v6);
-                computationBufferSpan[row + 6] = (short)(v1 - v6);
-                computationBufferSpan[row + 2] = (short)(v2 + v5);
-                computationBufferSpan[row + 5] = (short)(v2 - v5);
-                computationBufferSpan[row + 3] = (short)(v3 + v4);
-                computationBufferSpan[row + 4] = (short)(v3 - v4);
+                computationBuffer[row] = (short)(v0 + v7);
+                computationBuffer[row + 7] = (short)(v0 - v7);
+                computationBuffer[row + 1] = (short)(v1 + v6);
+                computationBuffer[row + 6] = (short)(v1 - v6);
+                computationBuffer[row + 2] = (short)(v2 + v5);
+                computationBuffer[row + 5] = (short)(v2 - v5);
+                computationBuffer[row + 3] = (short)(v3 + v4);
+                computationBuffer[row + 4] = (short)(v3 - v4);
             }
 
             // inverse DCT on columns
             for (int col = 0; col < 8; ++col)
             {
-                p0 = computationBufferSpan[col];
-                p1 = computationBufferSpan[col + 8];
-                p2 = computationBufferSpan[col + 16];
-                p3 = computationBufferSpan[col + 24];
-                p4 = computationBufferSpan[col + 32];
-                p5 = computationBufferSpan[col + 40];
-                p6 = computationBufferSpan[col + 48];
-                p7 = computationBufferSpan[col + 56];
+                p0 = computationBuffer[col];
+                p1 = computationBuffer[col + 8];
+                p2 = computationBuffer[col + 16];
+                p3 = computationBuffer[col + 24];
+                p4 = computationBuffer[col + 32];
+                p5 = computationBuffer[col + 40];
+                p6 = computationBuffer[col + 48];
+                p7 = computationBuffer[col + 56];
 
                 // check for all-zero AC coefficients
                 if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0)
@@ -302,195 +302,188 @@
         /// precise the scaled value, so this implementation does worse with high -
         /// quality - setting files than with low - quality ones.
         /// </summary>
-        /// <param name="quantizationTables">The quantization tables</param>
-        /// <param name="component">The fram component</param>
+        /// <param name="component">The frame component</param>
         /// <param name="blockBufferOffset">The block buffer offset</param>
         /// <param name="computationBuffer">The computational buffer for holding temp values</param>
-        public static void QuantizeAndInverseAlt(
-            QuantizationTables quantizationTables,
-            ref FrameComponent component,
-            int blockBufferOffset,
-            Buffer<short> computationBuffer)
+        /// <param name="multiplierTable">The multiplier table</param>
+        public static void QuantizeAndInverseFast(ref FrameComponent component, int blockBufferOffset, ref Span<short> computationBuffer, ref Span<short> multiplierTable)
         {
-            Span<short> qt = quantizationTables.Tables.GetRowSpan(component.QuantizationIdentifier);
             Span<short> blockData = component.BlockData.Slice(blockBufferOffset);
-            Span<short> computationBufferSpan = computationBuffer;
-
-            // For AA&N IDCT method, multiplier are equal to quantization
-            // coefficients scaled by scalefactor[row]*scalefactor[col], where
-            //   scalefactor[0] = 1
-            //   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
-            // For integer operation, the multiplier table is to be scaled by 14.
-            using (var multiplier = new Buffer<short>(64))
+            int p0, p1, p2, p3, p4, p5, p6, p7;
+
+            for (int col = 0; col < 8; col++)
             {
-                Span<short> multiplierSpan = multiplier;
-                for (int i = 0; i < 64; i++)
+                // Gather block data
+                p0 = blockData[col];
+                p1 = blockData[col + 8];
+                p2 = blockData[col + 16];
+                p3 = blockData[col + 24];
+                p4 = blockData[col + 32];
+                p5 = blockData[col + 40];
+                p6 = blockData[col + 48];
+                p7 = blockData[col + 56];
+
+                int tmp0 = p0 * multiplierTable[col];
+
+                // Due to quantization, we will usually find that many of the input
+                // coefficients are zero, especially the AC terms.  We can exploit this
+                // by short-circuiting the IDCT calculation for any column in which all
+                // the AC terms are zero.  In that case each output is equal to the
+                // DC coefficient (with scale factor as needed).
+                // With typical images and quantization tables, half or more of the
+                // column DCT calculations can be simplified this way.
+                if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0)
                 {
-                    multiplierSpan[i] = (short)Descale(qt[i] * Aanscales[i], 14 - Pass1Bits);
-                }
+                    short dcval = (short)tmp0;
 
-                int p0, p1, p2, p3, p4, p5, p6, p7;
+                    computationBuffer[col] = dcval;
+                    computationBuffer[col + 8] = dcval;
+                    computationBuffer[col + 16] = dcval;
+                    computationBuffer[col + 24] = dcval;
+                    computationBuffer[col + 32] = dcval;
+                    computationBuffer[col + 40] = dcval;
+                    computationBuffer[col + 48] = dcval;
+                    computationBuffer[col + 56] = dcval;
 
-                for (int col = 0; col < 8; col++)
-                {
-                    // Gather block data
-                    p0 = blockData[col];
-                    p1 = blockData[col + 8];
-                    p2 = blockData[col + 16];
-                    p3 = blockData[col + 24];
-                    p4 = blockData[col + 32];
-                    p5 = blockData[col + 40];
-                    p6 = blockData[col + 48];
-                    p7 = blockData[col + 56];
-
-                    int tmp0 = p0 * multiplierSpan[col];
-
-                    // Due to quantization, we will usually find that many of the input
-                    // coefficients are zero, especially the AC terms.  We can exploit this
-                    // by short-circuiting the IDCT calculation for any column in which all
-                    // the AC terms are zero.  In that case each output is equal to the
-                    // DC coefficient (with scale factor as needed).
-                    // With typical images and quantization tables, half or more of the
-                    // column DCT calculations can be simplified this way.
-                    if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0)
-                    {
-                        short dcval = (short)tmp0;
-
-                        computationBufferSpan[col] = dcval;
-                        computationBufferSpan[col + 8] = dcval;
-                        computationBufferSpan[col + 16] = dcval;
-                        computationBufferSpan[col + 24] = dcval;
-                        computationBufferSpan[col + 32] = dcval;
-                        computationBufferSpan[col + 40] = dcval;
-                        computationBufferSpan[col + 48] = dcval;
-                        computationBufferSpan[col + 56] = dcval;
-
-                        continue;
-                    }
-
-                    // Even part
-                    int tmp1 = p2 * multiplierSpan[col + 16];
-                    int tmp2 = p4 * multiplierSpan[col + 32];
-                    int tmp3 = p6 * multiplierSpan[col + 48];
-
-                    int tmp10 = tmp0 + tmp2; // Phase 3
-                    int tmp11 = tmp0 - tmp2;
-
-                    int tmp13 = tmp1 + tmp3; // Phases 5-3
-                    int tmp12 = Multiply(tmp1 - tmp3, FIX_1_414213562) - tmp13; // 2*c4
-
-                    tmp0 = tmp10 + tmp13; // Phase 2
-                    tmp3 = tmp10 - tmp13;
-                    tmp1 = tmp11 + tmp12;
-                    tmp2 = tmp11 - tmp12;
-
-                    // Odd Part
-                    int tmp4 = p1 * multiplierSpan[col + 8];
-                    int tmp5 = p3 * multiplierSpan[col + 24];
-                    int tmp6 = p5 * multiplierSpan[col + 40];
-                    int tmp7 = p7 * multiplierSpan[col + 56];
-
-                    int z13 = tmp6 + tmp5; // Phase 6
-                    int z10 = tmp6 - tmp5;
-                    int z11 = tmp4 + tmp7;
-                    int z12 = tmp4 - tmp7;
-
-                    tmp7 = z11 + z13; // Phase 5
-                    tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4
-
-                    int z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2
-                    tmp10 = z5 - Multiply(z12, FIX_1_082392200); // 2*(c2-c6)
-                    tmp12 = z5 - Multiply(z10, FIX_2_613125930); // 2*(c2+c6)
-
-                    tmp6 = tmp12 - tmp7; // Phase 2
-                    tmp5 = tmp11 - tmp6;
-                    tmp4 = tmp10 - tmp5;
-
-                    computationBufferSpan[col] = (short)(tmp0 + tmp7);
-                    computationBufferSpan[col + 56] = (short)(tmp0 - tmp7);
-                    computationBufferSpan[col + 8] = (short)(tmp1 + tmp6);
-                    computationBufferSpan[col + 48] = (short)(tmp1 - tmp6);
-                    computationBufferSpan[col + 16] = (short)(tmp2 + tmp5);
-                    computationBufferSpan[col + 40] = (short)(tmp2 - tmp5);
-                    computationBufferSpan[col + 24] = (short)(tmp3 + tmp4);
-                    computationBufferSpan[col + 32] = (short)(tmp3 - tmp4);
+                    continue;
                 }
 
-                // Pass 2: process rows from work array, store into output array.
-                // Note that we must descale the results by a factor of 8 == 2**3,
-                // and also undo the pass 1 bits scaling.
-                for (int row = 0; row < 64; row += 8)
+                // Even part
+                int tmp1 = p2 * multiplierTable[col + 16];
+                int tmp2 = p4 * multiplierTable[col + 32];
+                int tmp3 = p6 * multiplierTable[col + 48];
+
+                int tmp10 = tmp0 + tmp2; // Phase 3
+                int tmp11 = tmp0 - tmp2;
+
+                int tmp13 = tmp1 + tmp3; // Phases 5-3
+                int tmp12 = Multiply(tmp1 - tmp3, FIX_1_414213562) - tmp13; // 2*c4
+
+                tmp0 = tmp10 + tmp13; // Phase 2
+                tmp3 = tmp10 - tmp13;
+                tmp1 = tmp11 + tmp12;
+                tmp2 = tmp11 - tmp12;
+
+                // Odd Part
+                int tmp4 = p1 * multiplierTable[col + 8];
+                int tmp5 = p3 * multiplierTable[col + 24];
+                int tmp6 = p5 * multiplierTable[col + 40];
+                int tmp7 = p7 * multiplierTable[col + 56];
+
+                int z13 = tmp6 + tmp5; // Phase 6
+                int z10 = tmp6 - tmp5;
+                int z11 = tmp4 + tmp7;
+                int z12 = tmp4 - tmp7;
+
+                tmp7 = z11 + z13; // Phase 5
+                tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4
+
+                int z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2
+                tmp10 = z5 - Multiply(z12, FIX_1_082392200); // 2*(c2-c6)
+                tmp12 = z5 - Multiply(z10, FIX_2_613125930); // 2*(c2+c6)
+
+                tmp6 = tmp12 - tmp7; // Phase 2
+                tmp5 = tmp11 - tmp6;
+                tmp4 = tmp10 - tmp5;
+
+                computationBuffer[col] = (short)(tmp0 + tmp7);
+                computationBuffer[col + 56] = (short)(tmp0 - tmp7);
+                computationBuffer[col + 8] = (short)(tmp1 + tmp6);
+                computationBuffer[col + 48] = (short)(tmp1 - tmp6);
+                computationBuffer[col + 16] = (short)(tmp2 + tmp5);
+                computationBuffer[col + 40] = (short)(tmp2 - tmp5);
+                computationBuffer[col + 24] = (short)(tmp3 + tmp4);
+                computationBuffer[col + 32] = (short)(tmp3 - tmp4);
+            }
+
+            // Pass 2: process rows from work array, store into output array.
+            // Note that we must descale the results by a factor of 8 == 2**3,
+            // and also undo the pass 1 bits scaling.
+            for (int row = 0; row < 64; row += 8)
+            {
+                p1 = computationBuffer[row + 1];
+                p2 = computationBuffer[row + 2];
+                p3 = computationBuffer[row + 3];
+                p4 = computationBuffer[row + 4];
+                p5 = computationBuffer[row + 5];
+                p6 = computationBuffer[row + 6];
+                p7 = computationBuffer[row + 7];
+
+                // Add range center and fudge factor for final descale and range-limit.
+                int z5 = computationBuffer[row] + (RangeCenter << (Pass1Bits + 3)) + (1 << (Pass1Bits + 2));
+
+                // Check for all-zero AC coefficients
+                if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0)
                 {
-                    p1 = computationBufferSpan[row + 1];
-                    p2 = computationBufferSpan[row + 2];
-                    p3 = computationBufferSpan[row + 3];
-                    p4 = computationBufferSpan[row + 4];
-                    p5 = computationBufferSpan[row + 5];
-                    p6 = computationBufferSpan[row + 6];
-                    p7 = computationBufferSpan[row + 7];
-
-                    // Add range center and fudge factor for final descale and range-limit.
-                    int z5 = computationBufferSpan[row] + (RangeCenter << (Pass1Bits + 3)) + (1 << (Pass1Bits + 2));
-
-                    // Check for all-zero AC coefficients
-                    if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0)
-                    {
-                        byte dcval = Limit[LimitOffset + (RightShift(z5, Pass1Bits + 3) & RangeMask)];
-
-                        blockData[row] = dcval;
-                        blockData[row + 1] = dcval;
-                        blockData[row + 2] = dcval;
-                        blockData[row + 3] = dcval;
-                        blockData[row + 4] = dcval;
-                        blockData[row + 5] = dcval;
-                        blockData[row + 6] = dcval;
-                        blockData[row + 7] = dcval;
-
-                        continue;
-                    }
-
-                    // Even part
-                    int tmp10 = z5 + p4;
-                    int tmp11 = z5 - p4;
-
-                    int tmp13 = p2 + p6;
-                    int tmp12 = Multiply(p2 - p6, FIX_1_414213562) - tmp13; // 2*c4
-
-                    int tmp0 = tmp10 + tmp13;
-                    int tmp3 = tmp10 - tmp13;
-                    int tmp1 = tmp11 + tmp12;
-                    int tmp2 = tmp11 - tmp12;
-
-                    // Odd part
-                    int z13 = p5 + p3;
-                    int z10 = p5 - p3;
-                    int z11 = p1 + p7;
-                    int z12 = p1 - p7;
-
-                    int tmp7 = z11 + z13; // Phase 5
-                    tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4
-
-                    z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2
-                    tmp10 = z5 - Multiply(z12, FIX_1_082392200); // 2*(c2-c6)
-                    tmp12 = z5 - Multiply(z10, FIX_2_613125930); // 2*(c2+c6)
-
-                    int tmp6 = tmp12 - tmp7; // Phase 2
-                    int tmp5 = tmp11 - tmp6;
-                    int tmp4 = tmp10 - tmp5;
-
-                    // Final output stage: scale down by a factor of 8, offset, and range-limit
-                    blockData[row] = Limit[LimitOffset + (RightShift(tmp0 + tmp7, Pass1Bits + 3) & RangeMask)];
-                    blockData[row + 7] = Limit[LimitOffset + (RightShift(tmp0 - tmp7, Pass1Bits + 3) & RangeMask)];
-                    blockData[row + 1] = Limit[LimitOffset + (RightShift(tmp1 + tmp6, Pass1Bits + 3) & RangeMask)];
-                    blockData[row + 6] = Limit[LimitOffset + (RightShift(tmp1 - tmp6, Pass1Bits + 3) & RangeMask)];
-                    blockData[row + 2] = Limit[LimitOffset + (RightShift(tmp2 + tmp5, Pass1Bits + 3) & RangeMask)];
-                    blockData[row + 5] = Limit[LimitOffset + (RightShift(tmp2 - tmp5, Pass1Bits + 3) & RangeMask)];
-                    blockData[row + 3] = Limit[LimitOffset + (RightShift(tmp3 + tmp4, Pass1Bits + 3) & RangeMask)];
-                    blockData[row + 4] = Limit[LimitOffset + (RightShift(tmp3 - tmp4, Pass1Bits + 3) & RangeMask)];
+                    byte dcval = Limit[LimitOffset + (RightShift(z5, Pass1Bits + 3) & RangeMask)];
+
+                    blockData[row] = dcval;
+                    blockData[row + 1] = dcval;
+                    blockData[row + 2] = dcval;
+                    blockData[row + 3] = dcval;
+                    blockData[row + 4] = dcval;
+                    blockData[row + 5] = dcval;
+                    blockData[row + 6] = dcval;
+                    blockData[row + 7] = dcval;
+
+                    continue;
                 }
+
+                // Even part
+                int tmp10 = z5 + p4;
+                int tmp11 = z5 - p4;
+
+                int tmp13 = p2 + p6;
+                int tmp12 = Multiply(p2 - p6, FIX_1_414213562) - tmp13; // 2*c4
+
+                int tmp0 = tmp10 + tmp13;
+                int tmp3 = tmp10 - tmp13;
+                int tmp1 = tmp11 + tmp12;
+                int tmp2 = tmp11 - tmp12;
+
+                // Odd part
+                int z13 = p5 + p3;
+                int z10 = p5 - p3;
+                int z11 = p1 + p7;
+                int z12 = p1 - p7;
+
+                int tmp7 = z11 + z13; // Phase 5
+                tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4
+
+                z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2
+                tmp10 = z5 - Multiply(z12, FIX_1_082392200); // 2*(c2-c6)
+                tmp12 = z5 - Multiply(z10, FIX_2_613125930); // 2*(c2+c6)
+
+                int tmp6 = tmp12 - tmp7; // Phase 2
+                int tmp5 = tmp11 - tmp6;
+                int tmp4 = tmp10 - tmp5;
+
+                // Final output stage: scale down by a factor of 8, offset, and range-limit
+                blockData[row] = Limit[LimitOffset + (RightShift(tmp0 + tmp7, Pass1Bits + 3) & RangeMask)];
+                blockData[row + 7] = Limit[LimitOffset + (RightShift(tmp0 - tmp7, Pass1Bits + 3) & RangeMask)];
+                blockData[row + 1] = Limit[LimitOffset + (RightShift(tmp1 + tmp6, Pass1Bits + 3) & RangeMask)];
+                blockData[row + 6] = Limit[LimitOffset + (RightShift(tmp1 - tmp6, Pass1Bits + 3) & RangeMask)];
+                blockData[row + 2] = Limit[LimitOffset + (RightShift(tmp2 + tmp5, Pass1Bits + 3) & RangeMask)];
+                blockData[row + 5] = Limit[LimitOffset + (RightShift(tmp2 - tmp5, Pass1Bits + 3) & RangeMask)];
+                blockData[row + 3] = Limit[LimitOffset + (RightShift(tmp3 + tmp4, Pass1Bits + 3) & RangeMask)];
+                blockData[row + 4] = Limit[LimitOffset + (RightShift(tmp3 - tmp4, Pass1Bits + 3) & RangeMask)];
             }
         }
 
+        /// <summary>
+        /// Descale and correctly round an int value that's scaled by <paramref name="n"/> bits.
+        /// We assume <see cref="RightShift"/> rounds towards minus infinity, so adding
+        /// the fudge factor is correct for either sign of <paramref name="value"/>.
+        /// </summary>
+        /// <param name="value">The value</param>
+        /// <param name="n">The number of bits</param>
+        /// <returns>The <see cref="int"/></returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int Descale(int value, int n)
+        {
+            return RightShift(value + (1 << (n - 1)), n);
+        }
+
         /// <summary>
         /// Multiply a variable by an int constant, and immediately descale.
         /// </summary>
@@ -514,19 +507,5 @@
         {
             return value >> shift;
         }
-
-        /// <summary>
-        /// Descale and correctly round an int value that's scaled by <paramref name="n"/> bits.
-        /// We assume <see cref="RightShift"/> rounds towards minus infinity, so adding
-        /// the fudge factor is correct for either sign of <paramref name="value"/>.
-        /// </summary>
-        /// <param name="value">The value</param>
-        /// <param name="n">The number of bits</param>
-        /// <returns>The <see cref="int"/></returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static int Descale(int value, int n)
-        {
-            return RightShift(value + (1 << (n - 1)), n);
-        }
     }
 }
\ No newline at end of file
diff --git a/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs
index ef49dfaf06..074ee3cfdc 100644
--- a/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs
@@ -792,13 +792,28 @@ namespace ImageSharp.Formats.Jpeg.Port
             int blocksPerLine = component.BlocksPerLine;
             int blocksPerColumn = component.BlocksPerColumn;
             using (var computationBuffer = Buffer<short>.CreateClean(64))
-            {
+            using (var multiplicationBuffer = Buffer<short>.CreateClean(64))
+            {
+                Span<short> quantizationTable = this.quantizationTables.Tables.GetRowSpan(frameComponent.QuantizationIdentifier);
+                Span<short> computationBufferSpan = computationBuffer;
+
+                // For AA&N IDCT method, multiplier are equal to quantization
+                // coefficients scaled by scalefactor[row]*scalefactor[col], where
+                //   scalefactor[0] = 1
+                //   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+                // For integer operation, the 14-bit scaled products are descaled by 12 bits, leaving the table scaled by 2 bits.
+                Span<short> multiplierSpan = multiplicationBuffer;
+                for (int i = 0; i < 64; i++)
+                {
+                    multiplierSpan[i] = (short)IDCT.Descale(quantizationTable[i] * IDCT.Aanscales[i], 12);
+                }
+
                 for (int blockRow = 0; blockRow < blocksPerColumn; blockRow++)
                 {
                     for (int blockCol = 0; blockCol < blocksPerLine; blockCol++)
                     {
                         int offset = GetBlockBufferOffset(ref component, blockRow, blockCol);
-                        IDCT.QuantizeAndInverseAlt(this.quantizationTables, ref frameComponent, offset, computationBuffer);
+                        IDCT.QuantizeAndInverseFast(ref frameComponent, offset, ref computationBufferSpan, ref multiplierSpan);
                     }
                 }
             }
@@ -808,7 +823,6 @@ namespace ImageSharp.Formats.Jpeg.Port
 
         /// <summary>
         /// Builds the huffman tables
-        /// TODO: This is our bottleneck. We should use a faster algorithm with a LUT.
         /// </summary>
         /// <param name="tables">The tables</param>
         /// <param name="index">The table index</param>