diff --git a/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs b/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs index f931c3d6b4..6b77dddb1f 100644 --- a/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs @@ -1,6 +1,9 @@ namespace ImageSharp.Formats.Jpeg.Port.Components { using System; + using System.Numerics; + using System.Runtime.CompilerServices; + using ImageSharp.Memory; /// @@ -17,6 +20,47 @@ private const int DctSqrt2 = 5793; // sqrt(2) private const int DctSqrt1D2 = 2896; // sqrt(2) / 2 +#pragma warning disable SA1310 // Field names must not contain underscore + private const int FIX_1_082392200 = 277; /* FIX(1.082392200) */ + private const int FIX_1_414213562 = 362; /* FIX(1.414213562) */ + private const int FIX_1_847759065 = 473; /* FIX(1.847759065) */ + private const int FIX_2_613125930 = 669; /* FIX(2.613125930) */ +#pragma warning restore SA1310 // Field names must not contain underscore + + private const int ScaleBits = 2; /* fractional bits in scale factors */ + + /* + * Each IDCT routine is responsible for range-limiting its results and + * converting them to unsigned form (0..255). The raw outputs could + * be quite far out of range if the input data is corrupt, so a bulletproof + * range-limiting step is required. We use a mask-and-table-lookup method + * to do the combined operations quickly, assuming that 255+1 + * is a power of 2. See the comments with prepare_range_limit_table for more info. + */ + private const int RangeMask = (255 * 4) + 3; /* 2 bits wider than legal samples */ + + private static readonly byte[] Limit = new byte[5 * (255 + 1)]; + + static IDCT() + { + // First segment of range limit table: limit[x] = 0 for x < 0 + // allow negative subscripts of simple table */ + int tableOffset = 2 * (255 + 1); + + // Main part of range limit table: limit[x] = x + int i; + for (i = 0; i <= 255; i++) + { + Limit[tableOffset + i] = (byte)i; + } + + /* End of range limit table: limit[x] = MAXJSAMPLE for x > MAXJSAMPLE */ + for (; i < 3 * (255 + 1); i++) + { + Limit[tableOffset + i] = 255; + } + } + /// /// A port of Poppler's IDCT method which in turn is taken from: /// Christoph Loeffler, Adriaan Ligtenberg, George S. Moschytz, @@ -219,5 +263,207 @@ blockData[col + 56] = (short)p7; } } + + /// + /// A port of + /// TODO: This does not work!! + /// A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT + /// on each row(or vice versa, but it's more convenient to emit a row at + /// a time). Direct algorithms are also available, but they are much more + /// complex and seem not to be any faster when reduced to code. + /// + /// This implementation is based on Arai, Agui, and Nakajima's algorithm for + /// scaled DCT.Their original paper (Trans.IEICE E-71(11):1095) is in + /// Japanese, but the algorithm is described in the Pennebaker & Mitchell + /// JPEG textbook(see REFERENCES section in file README.ijg). The following + /// code is based directly on figure 4-8 in P&M. + /// While an 8-point DCT cannot be done in less than 11 multiplies, it is + /// possible to arrange the computation so that many of the multiplies are + /// simple scalings of the final outputs.These multiplies can then be + /// folded into the multiplications or divisions by the JPEG quantization + /// table entries. The AA&N method leaves only 5 multiplies and 29 adds + /// to be done in the DCT itself. + /// The primary disadvantage of this method is that with fixed-point math, + /// accuracy is lost due to imprecise representation of the scaled + /// quantization values.The smaller the quantization table entry, the less + /// precise the scaled value, so this implementation does worse with high - + /// quality - setting files than with low - quality ones. + /// + /// The quantization tables + /// The fram component + /// The block buffer offset + /// The computational buffer for holding temp values + public static void QuantizeAndInverseAlt(QuantizationTables quantizationTables, ref FrameComponent component, int blockBufferOffset, Buffer computationBuffer) + { + Span qt = quantizationTables.Tables.GetRowSpan(component.QuantizationIdentifier); + Span blockData = component.BlockData.Slice(blockBufferOffset); + Span computationBufferSpan = computationBuffer; + + int p0, p1, p2, p3, p4, p5, p6, p7; + + for (int col = 0; col < 8; col++) + { + // Gather block data + p0 = blockData[col]; + p1 = blockData[col + 8]; + p2 = blockData[col + 16]; + p3 = blockData[col + 24]; + p4 = blockData[col + 32]; + p5 = blockData[col + 40]; + p6 = blockData[col + 48]; + p7 = blockData[col + 56]; + + int tmp0 = p0 * qt[col]; + + // Check for all-zero AC coefficients + if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0) + { + short dcval = (short)tmp0; + + computationBufferSpan[col] = dcval; + computationBufferSpan[col + 8] = dcval; + computationBufferSpan[col + 16] = dcval; + computationBufferSpan[col + 24] = dcval; + computationBufferSpan[col + 32] = dcval; + computationBufferSpan[col + 40] = dcval; + computationBufferSpan[col + 48] = dcval; + computationBufferSpan[col + 56] = dcval; + continue; + } + + // Even part + int tmp1 = p2 * qt[col + 16]; + int tmp2 = p4 * qt[col + 32]; + int tmp3 = p6 * qt[col + 48]; + + int tmp10 = tmp0 + tmp2; // Phase 3 + int tmp11 = tmp0 - tmp2; + + int tmp13 = tmp1 + tmp3; // Phases 5-3 + int tmp12 = Multiply(tmp1 - tmp3, FIX_1_414213562) - tmp13; // 2*c4 + + tmp0 = tmp10 + tmp13; // Phase 2 + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + // Odd Part + int tmp4 = p1 * qt[col + 8]; + int tmp5 = p3 * qt[col + 24]; + int tmp6 = p5 * qt[col + 40]; + int tmp7 = p7 * qt[col + 56]; + + int z13 = tmp6 + tmp5; // Phase 6 + int z10 = tmp6 - tmp5; + int z11 = tmp4 + tmp7; + int z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; // Phase 5 + tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4 + + int z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2 + tmp10 = Multiply(z12, FIX_1_082392200) - z5; // 2*(c2-c6) + tmp12 = Multiply(z10, FIX_2_613125930) + z5; // 2*(c2+c6) + + tmp6 = tmp12 - tmp7; // Phase 2 + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 - tmp5; + + computationBufferSpan[col] = (short)(tmp0 + tmp7); + computationBufferSpan[col + 56] = (short)(tmp0 - tmp7); + computationBufferSpan[col + 8] = (short)(tmp1 + tmp6); + computationBufferSpan[col + 48] = (short)(tmp1 - tmp6); + computationBufferSpan[col + 16] = (short)(tmp2 + tmp5); + computationBufferSpan[col + 40] = (short)(tmp2 - tmp5); + computationBufferSpan[col + 32] = (short)(tmp3 + tmp4); + computationBufferSpan[col + 24] = (short)(tmp3 - tmp4); + } + + // Pass 2: process rows from work array, store into output array. + // Note that we must descale the results by a factor of 8 == 2**3, + // and also undo the pass 1 bits scaling. + for (int row = 0; row < 64; row += 8) + { + p0 = computationBufferSpan[row]; + p1 = computationBufferSpan[row + 1]; + p2 = computationBufferSpan[row + 2]; + p3 = computationBufferSpan[row + 3]; + p4 = computationBufferSpan[row + 4]; + p5 = computationBufferSpan[row + 5]; + p6 = computationBufferSpan[row + 6]; + p7 = computationBufferSpan[row + 7]; + + // Check for all-zero AC coefficients + if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0) + { + byte dcval = Limit[Descale(p0, ScaleBits + 3) & RangeMask]; + + blockData[row] = dcval; + blockData[row + 1] = dcval; + blockData[row + 2] = dcval; + blockData[row + 3] = dcval; + blockData[row + 4] = dcval; + blockData[row + 5] = dcval; + blockData[row + 6] = dcval; + blockData[row + 7] = dcval; + + continue; + } + + // Even part + int tmp10 = p0 + p4; + int tmp11 = p0 - p4; + + int tmp13 = p2 + p6; + int tmp12 = Multiply(p2 - p6, FIX_1_414213562) - tmp13; /* 2*c4 */ + + int tmp0 = tmp10 + tmp13; + int tmp3 = tmp10 - tmp13; + int tmp1 = tmp11 + tmp12; + int tmp2 = tmp11 - tmp12; + + // Odd part + int z13 = p5 + p3; + int z10 = p5 - p3; + int z11 = p1 + p7; + int z12 = p1 - p7; + + int tmp7 = z11 + z13; // Phase 5 + tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4 + + int z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2 + tmp10 = Multiply(z12, FIX_1_082392200) - z5; // 2*(c2-c6) + tmp12 = Multiply(z10, FIX_2_613125930) + z5; // -2*(c2+c6) + + int tmp6 = tmp12 - tmp7; // Phase 2 + int tmp5 = tmp11 - tmp6; + int tmp4 = tmp10 - tmp5; + + // Final output stage: scale down by a factor of 8 and range-limit + blockData[row] = Limit[Descale(tmp0 + tmp7, ScaleBits + 3) & RangeMask]; + blockData[row + 7] = Limit[Descale(tmp0 - tmp7, ScaleBits + 3) & RangeMask]; + blockData[row + 1] = Limit[Descale(tmp1 + tmp6, ScaleBits + 3) & RangeMask]; + blockData[row + 6] = Limit[Descale(tmp1 - tmp6, ScaleBits + 3) & RangeMask]; + blockData[row + 2] = Limit[Descale(tmp2 + tmp5, ScaleBits + 3) & RangeMask]; + blockData[row + 5] = Limit[Descale(tmp2 - tmp5, ScaleBits + 3) & RangeMask]; + blockData[row + 3] = Limit[Descale(tmp3 + tmp4, ScaleBits + 3) & RangeMask]; + blockData[row + 4] = Limit[Descale(tmp3 - tmp4, ScaleBits + 3) & RangeMask]; + } + } + + private static int Multiply(int val, int c) + { + return Descale(val * c, 8); + } + + private static int RightShift(int x, int shft) + { + return x >> shft; + } + + private static int Descale(int x, int n) + { + return RightShift(x + (1 << (n - 1)), n); + } } -} +} \ No newline at end of file diff --git a/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs index d7f8a3ae78..ef49dfaf06 100644 --- a/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs @@ -798,7 +798,7 @@ namespace ImageSharp.Formats.Jpeg.Port for (int blockCol = 0; blockCol < blocksPerLine; blockCol++) { int offset = GetBlockBufferOffset(ref component, blockRow, blockCol); - IDCT.QuantizeAndInverse(this.quantizationTables, ref frameComponent, offset, computationBuffer); + IDCT.QuantizeAndInverseAlt(this.quantizationTables, ref frameComponent, offset, computationBuffer); } } }