Add WIP faster IDCT method

This is killing me!
9 years ago · 738fc202a7
2 changed files with 248 additions and 2 deletions
--- a/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs
@ -1,6 +1,9 @@
 namespace ImageSharp.Formats.Jpeg.Port.Components
 {
    using System;
+    using System.Numerics;
+    using System.Runtime.CompilerServices;
+
    using ImageSharp.Memory;

    /// <summary>
@ -17,6 +20,47 @@
        private const int DctSqrt2 = 5793;   // sqrt(2)
        private const int DctSqrt1D2 = 2896;  // sqrt(2) / 2

+#pragma warning disable SA1310 // Field names must not contain underscore
+        private const int FIX_1_082392200 = 277;        /* FIX(1.082392200) */
+        private const int FIX_1_414213562 = 362;        /* FIX(1.414213562) */
+        private const int FIX_1_847759065 = 473;        /* FIX(1.847759065) */
+        private const int FIX_2_613125930 = 669;        /* FIX(2.613125930) */
+#pragma warning restore SA1310 // Field names must not contain underscore
+
+        private const int ScaleBits = 2; /* fractional bits in scale factors */
+
+        /*
+        * Each IDCT routine is responsible for range-limiting its results and
+        * converting them to unsigned form (0..255).  The raw outputs could
+        * be quite far out of range if the input data is corrupt, so a bulletproof
+        * range-limiting step is required.  We use a mask-and-table-lookup method
+        * to do the combined operations quickly, assuming that 255+1
+        * is a power of 2.  See the comments with prepare_range_limit_table for more info.
+        */
+        private const int RangeMask = (255 * 4) + 3; /* 2 bits wider than legal samples */
+
+        private static readonly byte[] Limit = new byte[5 * (255 + 1)];
+
+        static IDCT()
+        {
+            // First segment of range limit table: limit[x] = 0 for x < 0
+            // allow negative subscripts of simple table */
+            int tableOffset = 2 * (255 + 1);
+
+            // Main part of range limit table: limit[x] = x
+            int i;
+            for (i = 0; i <= 255; i++)
+            {
+                Limit[tableOffset + i] = (byte)i;
+            }
+
+            /* End of range limit table: limit[x] = MAXJSAMPLE for x > MAXJSAMPLE */
+            for (; i < 3 * (255 + 1); i++)
+            {
+                Limit[tableOffset + i] = 255;
+            }
+        }
+
        /// <summary>
        /// A port of Poppler's IDCT method which in turn is taken from:
        /// Christoph Loeffler, Adriaan Ligtenberg, George S. Moschytz,
@ -219,5 +263,207 @@
                blockData[col + 56] = (short)p7;
            }
        }
+
+        /// <summary>
+        /// A port of <see href="https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/jidctfst.c#L171"/>
+        /// TODO: This does not work!!
+        /// A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
+        /// on each row(or vice versa, but it's more convenient to emit a row at
+        /// a time).  Direct algorithms are also available, but they are much more
+        /// complex and seem not to be any faster when reduced to code.
+        ///
+        /// This implementation is based on Arai, Agui, and Nakajima's algorithm for
+        /// scaled DCT.Their original paper (Trans.IEICE E-71(11):1095) is in
+        /// Japanese, but the algorithm is described in the Pennebaker & Mitchell
+        /// JPEG textbook(see REFERENCES section in file README.ijg).  The following
+        /// code is based directly on figure 4-8 in P&amp;M.
+        /// While an 8-point DCT cannot be done in less than 11 multiplies, it is
+        /// possible to arrange the computation so that many of the multiplies are
+        /// simple scalings of the final outputs.These multiplies can then be
+        /// folded into the multiplications or divisions by the JPEG quantization
+        /// table entries.  The AA&amp;N method leaves only 5 multiplies and 29 adds
+        /// to be done in the DCT itself.
+        /// The primary disadvantage of this method is that with fixed-point math,
+        /// accuracy is lost due to imprecise representation of the scaled
+        /// quantization values.The smaller the quantization table entry, the less
+        /// precise the scaled value, so this implementation does worse with high -
+        /// quality - setting files than with low - quality ones.
+        /// </summary>
+        /// <param name="quantizationTables">The quantization tables</param>
+        /// <param name="component">The fram component</param>
+        /// <param name="blockBufferOffset">The block buffer offset</param>
+        /// <param name="computationBuffer">The computational buffer for holding temp values</param>
+        public static void QuantizeAndInverseAlt(QuantizationTables quantizationTables, ref FrameComponent component, int blockBufferOffset, Buffer<short> computationBuffer)
+        {
+            Span<short> qt = quantizationTables.Tables.GetRowSpan(component.QuantizationIdentifier);
+            Span<short> blockData = component.BlockData.Slice(blockBufferOffset);
+            Span<short> computationBufferSpan = computationBuffer;
+
+            int p0, p1, p2, p3, p4, p5, p6, p7;
+
+            for (int col = 0; col < 8; col++)
+            {
+                // Gather block data
+                p0 = blockData[col];
+                p1 = blockData[col + 8];
+                p2 = blockData[col + 16];
+                p3 = blockData[col + 24];
+                p4 = blockData[col + 32];
+                p5 = blockData[col + 40];
+                p6 = blockData[col + 48];
+                p7 = blockData[col + 56];
+
+                int tmp0 = p0 * qt[col];
+
+                // Check for all-zero AC coefficients
+                if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0)
+                {
+                    short dcval = (short)tmp0;
+
+                    computationBufferSpan[col] = dcval;
+                    computationBufferSpan[col + 8] = dcval;
+                    computationBufferSpan[col + 16] = dcval;
+                    computationBufferSpan[col + 24] = dcval;
+                    computationBufferSpan[col + 32] = dcval;
+                    computationBufferSpan[col + 40] = dcval;
+                    computationBufferSpan[col + 48] = dcval;
+                    computationBufferSpan[col + 56] = dcval;
+                    continue;
+                }
+
+                // Even part
+                int tmp1 = p2 * qt[col + 16];
+                int tmp2 = p4 * qt[col + 32];
+                int tmp3 = p6 * qt[col + 48];
+
+                int tmp10 = tmp0 + tmp2;    // Phase 3
+                int tmp11 = tmp0 - tmp2;
+
+                int tmp13 = tmp1 + tmp3;    // Phases 5-3
+                int tmp12 = Multiply(tmp1 - tmp3, FIX_1_414213562) - tmp13; // 2*c4
+
+                tmp0 = tmp10 + tmp13;   // Phase 2
+                tmp3 = tmp10 - tmp13;
+                tmp1 = tmp11 + tmp12;
+                tmp2 = tmp11 - tmp12;
+
+                // Odd Part
+                int tmp4 = p1 * qt[col + 8];
+                int tmp5 = p3 * qt[col + 24];
+                int tmp6 = p5 * qt[col + 40];
+                int tmp7 = p7 * qt[col + 56];
+
+                int z13 = tmp6 + tmp5;      // Phase 6
+                int z10 = tmp6 - tmp5;
+                int z11 = tmp4 + tmp7;
+                int z12 = tmp4 - tmp7;
+
+                tmp7 = z11 + z13;       // Phase 5
+                tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4
+
+                int z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2
+                tmp10 = Multiply(z12, FIX_1_082392200) - z5; // 2*(c2-c6)
+                tmp12 = Multiply(z10, FIX_2_613125930) + z5; // 2*(c2+c6)
+
+                tmp6 = tmp12 - tmp7;    // Phase 2
+                tmp5 = tmp11 - tmp6;
+                tmp4 = tmp10 - tmp5;
+
+                computationBufferSpan[col] = (short)(tmp0 + tmp7);
+                computationBufferSpan[col + 56] = (short)(tmp0 - tmp7);
+                computationBufferSpan[col + 8] = (short)(tmp1 + tmp6);
+                computationBufferSpan[col + 48] = (short)(tmp1 - tmp6);
+                computationBufferSpan[col + 16] = (short)(tmp2 + tmp5);
+                computationBufferSpan[col + 40] = (short)(tmp2 - tmp5);
+                computationBufferSpan[col + 32] = (short)(tmp3 + tmp4);
+                computationBufferSpan[col + 24] = (short)(tmp3 - tmp4);
+            }
+
+            // Pass 2: process rows from work array, store into output array.
+            // Note that we must descale the results by a factor of 8 == 2**3,
+            // and also undo the pass 1 bits scaling.
+            for (int row = 0; row < 64; row += 8)
+            {
+                p0 = computationBufferSpan[row];
+                p1 = computationBufferSpan[row + 1];
+                p2 = computationBufferSpan[row + 2];
+                p3 = computationBufferSpan[row + 3];
+                p4 = computationBufferSpan[row + 4];
+                p5 = computationBufferSpan[row + 5];
+                p6 = computationBufferSpan[row + 6];
+                p7 = computationBufferSpan[row + 7];
+
+                // Check for all-zero AC coefficients
+                if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0)
+                {
+                    byte dcval = Limit[Descale(p0, ScaleBits + 3) & RangeMask];
+
+                    blockData[row] = dcval;
+                    blockData[row + 1] = dcval;
+                    blockData[row + 2] = dcval;
+                    blockData[row + 3] = dcval;
+                    blockData[row + 4] = dcval;
+                    blockData[row + 5] = dcval;
+                    blockData[row + 6] = dcval;
+                    blockData[row + 7] = dcval;
+
+                    continue;
+                }
+
+                // Even part
+                int tmp10 = p0 + p4;
+                int tmp11 = p0 - p4;
+
+                int tmp13 = p2 + p6;
+                int tmp12 = Multiply(p2 - p6, FIX_1_414213562) - tmp13; /* 2*c4 */
+
+                int tmp0 = tmp10 + tmp13;
+                int tmp3 = tmp10 - tmp13;
+                int tmp1 = tmp11 + tmp12;
+                int tmp2 = tmp11 - tmp12;
+
+                // Odd part
+                int z13 = p5 + p3;
+                int z10 = p5 - p3;
+                int z11 = p1 + p7;
+                int z12 = p1 - p7;
+
+                int tmp7 = z11 + z13;       // Phase 5
+                tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4
+
+                int z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2
+                tmp10 = Multiply(z12, FIX_1_082392200) - z5; // 2*(c2-c6)
+                tmp12 = Multiply(z10, FIX_2_613125930) + z5; // -2*(c2+c6)
+
+                int tmp6 = tmp12 - tmp7;    // Phase 2
+                int tmp5 = tmp11 - tmp6;
+                int tmp4 = tmp10 - tmp5;
+
+                // Final output stage: scale down by a factor of 8 and range-limit
+                blockData[row] = Limit[Descale(tmp0 + tmp7, ScaleBits + 3) & RangeMask];
+                blockData[row + 7] = Limit[Descale(tmp0 - tmp7, ScaleBits + 3) & RangeMask];
+                blockData[row + 1] = Limit[Descale(tmp1 + tmp6, ScaleBits + 3) & RangeMask];
+                blockData[row + 6] = Limit[Descale(tmp1 - tmp6, ScaleBits + 3) & RangeMask];
+                blockData[row + 2] = Limit[Descale(tmp2 + tmp5, ScaleBits + 3) & RangeMask];
+                blockData[row + 5] = Limit[Descale(tmp2 - tmp5, ScaleBits + 3) & RangeMask];
+                blockData[row + 3] = Limit[Descale(tmp3 + tmp4, ScaleBits + 3) & RangeMask];
+                blockData[row + 4] = Limit[Descale(tmp3 - tmp4, ScaleBits + 3) & RangeMask];
+            }
+        }
+
+        private static int Multiply(int val, int c)
+        {
+            return Descale(val * c, 8);
+        }
+
+        private static int RightShift(int x, int shft)
+        {
+            return x >> shft;
+        }
+
+        private static int Descale(int x, int n)
+        {
+            return RightShift(x + (1 << (n - 1)), n);
+        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs
@ -798,7 +798,7 @@ namespace ImageSharp.Formats.Jpeg.Port
                    for (int blockCol = 0; blockCol < blocksPerLine; blockCol++)
                    {
                        int offset = GetBlockBufferOffset(ref component, blockRow, blockCol);
-                        IDCT.QuantizeAndInverse(this.quantizationTables, ref frameComponent, offset, computationBuffer);
+                        IDCT.QuantizeAndInverseAlt(this.quantizationTables, ref frameComponent, offset, computationBuffer);
                    }
                }
            }