From 656c45436490c62970f547906b1dfab22c777785 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sun, 9 Jul 2017 17:33:26 +1000 Subject: [PATCH] Reduce allocations --- .../Formats/Jpeg/Port/Components/IDCT.cs | 477 +++++++++--------- .../Formats/Jpeg/Port/JpegDecoderCore.cs | 20 +- 2 files changed, 245 insertions(+), 252 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs b/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs index 65c2bbde60..064b3bea36 100644 --- a/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Port/Components/IDCT.cs @@ -6,24 +6,38 @@ using ImageSharp.Memory; /// - /// Performa the invers + /// Performs the inverse Descrete Cosine Transform on each frame component. /// internal static class IDCT { - private const int DctCos1 = 4017; // cos(pi/16) - private const int DctSin1 = 799; // sin(pi/16) - private const int DctCos3 = 3406; // cos(3*pi/16) - private const int DctSin3 = 2276; // sin(3*pi/16) - private const int DctCos6 = 1567; // cos(6*pi/16) - private const int DctSin6 = 3784; // sin(6*pi/16) - private const int DctSqrt2 = 5793; // sqrt(2) + /// + /// Precomputed values scaled up by 14 bits + /// + public static readonly short[] Aanscales = + { + 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, 22725, 31521, 29692, 26722, 22725, 17855, + 12299, 6270, 21407, 29692, 27969, 25172, 21407, 16819, 11585, + 5906, 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315, + 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, 12873, + 17855, 16819, 15137, 12873, 10114, 6967, 3552, 8867, 12299, + 11585, 10426, 8867, 6967, 4799, 2446, 4520, 6270, 5906, 5315, + 4520, 3552, 2446, 1247 + }; + + private const int DctCos1 = 4017; // cos(pi/16) + private const int DctSin1 = 799; // sin(pi/16) + private const int DctCos3 = 3406; // cos(3*pi/16) + private const int DctSin3 = 2276; // sin(3*pi/16) + private const int DctCos6 = 1567; // cos(6*pi/16) + private const int DctSin6 = 3784; // sin(6*pi/16) + private const int DctSqrt2 = 5793; // sqrt(2) private const int DctSqrt1D2 = 2896; // sqrt(2) / 2 #pragma warning disable SA1310 // Field names must not contain underscore - private const int FIX_1_082392200 = 277; /* FIX(1.082392200) */ - private const int FIX_1_414213562 = 362; /* FIX(1.414213562) */ - private const int FIX_1_847759065 = 473; /* FIX(1.847759065) */ - private const int FIX_2_613125930 = 669; /* FIX(2.613125930) */ + private const int FIX_1_082392200 = 277; // FIX(1.082392200) + private const int FIX_1_414213562 = 362; // FIX(1.414213562) + private const int FIX_1_847759065 = 473; // FIX(1.847759065) + private const int FIX_2_613125930 = 669; // FIX(2.613125930) #pragma warning restore SA1310 // Field names must not contain underscore private const int ConstBits = 8; @@ -42,21 +56,9 @@ // be quite far out of range if the input data is corrupt, so a bulletproof // range-limiting step is required. We use a mask-and-table-lookup method // to do the combined operations quickly, assuming that MaxJSample+1 - // is a power of 2. See the comments with prepare_range_limit_table for more info. + // is a power of 2. private const int RangeMask = (MaxJSample * 4) + 3; // 2 bits wider than legal samples - // Precomputed values scaled up by 14 bits - private static readonly short[] Aanscales = - { - 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, 22725, 31521, 29692, 26722, 22725, 17855, - 12299, 6270, 21407, 29692, 27969, 25172, 21407, 16819, 11585, - 5906, 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315, - 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, 12873, - 17855, 16819, 15137, 12873, 10114, 6967, 3552, 8867, 12299, - 11585, 10426, 8867, 6967, 4799, 2446, 4520, 6270, 5906, 5315, - 4520, 3552, 2446, 1247 - }; - private static readonly byte[] Limit = new byte[5 * (MaxJSample + 1)]; static IDCT() @@ -81,15 +83,13 @@ /// 'Practical Fast 1-D DCT Algorithms with 11 Multiplications', /// IEEE Intl. Conf. on Acoustics, Speech & Signal Processing, 1989, 988-991. /// - /// The quantization tables /// The fram component /// The block buffer offset /// The computational buffer for holding temp values - public static void QuantizeAndInverse(QuantizationTables quantizationTables, ref FrameComponent component, int blockBufferOffset, Buffer computationBuffer) + /// The quantization table + public static void QuantizeAndInverse(ref FrameComponent component, int blockBufferOffset, ref Span computationBuffer, ref Span quantizationTable) { - Span qt = quantizationTables.Tables.GetRowSpan(component.QuantizationIdentifier); Span blockData = component.BlockData.Slice(blockBufferOffset); - Span computationBufferSpan = computationBuffer; int v0, v1, v2, v3, v4, v5, v6, v7; int p0, p1, p2, p3, p4, p5, p6, p7; int t; @@ -108,32 +108,32 @@ p7 = blockData[row + 7]; // dequant p0 - p0 *= qt[row]; + p0 *= quantizationTable[row]; // check for all-zero AC coefficients if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0) { t = ((DctSqrt2 * p0) + 512) >> 10; short st = (short)t; - computationBufferSpan[row] = st; - computationBufferSpan[row + 1] = st; - computationBufferSpan[row + 2] = st; - computationBufferSpan[row + 3] = st; - computationBufferSpan[row + 4] = st; - computationBufferSpan[row + 5] = st; - computationBufferSpan[row + 6] = st; - computationBufferSpan[row + 7] = st; + computationBuffer[row] = st; + computationBuffer[row + 1] = st; + computationBuffer[row + 2] = st; + computationBuffer[row + 3] = st; + computationBuffer[row + 4] = st; + computationBuffer[row + 5] = st; + computationBuffer[row + 6] = st; + computationBuffer[row + 7] = st; continue; } // dequant p1 ... p7 - p1 *= qt[row + 1]; - p2 *= qt[row + 2]; - p3 *= qt[row + 3]; - p4 *= qt[row + 4]; - p5 *= qt[row + 5]; - p6 *= qt[row + 6]; - p7 *= qt[row + 7]; + p1 *= quantizationTable[row + 1]; + p2 *= quantizationTable[row + 2]; + p3 *= quantizationTable[row + 3]; + p4 *= quantizationTable[row + 4]; + p5 *= quantizationTable[row + 5]; + p6 *= quantizationTable[row + 6]; + p7 *= quantizationTable[row + 7]; // stage 4 v0 = ((DctSqrt2 * p0) + 128) >> 8; @@ -169,27 +169,27 @@ v6 = t; // stage 1 - computationBufferSpan[row] = (short)(v0 + v7); - computationBufferSpan[row + 7] = (short)(v0 - v7); - computationBufferSpan[row + 1] = (short)(v1 + v6); - computationBufferSpan[row + 6] = (short)(v1 - v6); - computationBufferSpan[row + 2] = (short)(v2 + v5); - computationBufferSpan[row + 5] = (short)(v2 - v5); - computationBufferSpan[row + 3] = (short)(v3 + v4); - computationBufferSpan[row + 4] = (short)(v3 - v4); + computationBuffer[row] = (short)(v0 + v7); + computationBuffer[row + 7] = (short)(v0 - v7); + computationBuffer[row + 1] = (short)(v1 + v6); + computationBuffer[row + 6] = (short)(v1 - v6); + computationBuffer[row + 2] = (short)(v2 + v5); + computationBuffer[row + 5] = (short)(v2 - v5); + computationBuffer[row + 3] = (short)(v3 + v4); + computationBuffer[row + 4] = (short)(v3 - v4); } // inverse DCT on columns for (int col = 0; col < 8; ++col) { - p0 = computationBufferSpan[col]; - p1 = computationBufferSpan[col + 8]; - p2 = computationBufferSpan[col + 16]; - p3 = computationBufferSpan[col + 24]; - p4 = computationBufferSpan[col + 32]; - p5 = computationBufferSpan[col + 40]; - p6 = computationBufferSpan[col + 48]; - p7 = computationBufferSpan[col + 56]; + p0 = computationBuffer[col]; + p1 = computationBuffer[col + 8]; + p2 = computationBuffer[col + 16]; + p3 = computationBuffer[col + 24]; + p4 = computationBuffer[col + 32]; + p5 = computationBuffer[col + 40]; + p6 = computationBuffer[col + 48]; + p7 = computationBuffer[col + 56]; // check for all-zero AC coefficients if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0) @@ -302,195 +302,188 @@ /// precise the scaled value, so this implementation does worse with high - /// quality - setting files than with low - quality ones. /// - /// The quantization tables - /// The fram component + /// The frame component /// The block buffer offset /// The computational buffer for holding temp values - public static void QuantizeAndInverseAlt( - QuantizationTables quantizationTables, - ref FrameComponent component, - int blockBufferOffset, - Buffer computationBuffer) + /// The multiplier table + public static void QuantizeAndInverseFast(ref FrameComponent component, int blockBufferOffset, ref Span computationBuffer, ref Span multiplierTable) { - Span qt = quantizationTables.Tables.GetRowSpan(component.QuantizationIdentifier); Span blockData = component.BlockData.Slice(blockBufferOffset); - Span computationBufferSpan = computationBuffer; - - // For AA&N IDCT method, multiplier are equal to quantization - // coefficients scaled by scalefactor[row]*scalefactor[col], where - // scalefactor[0] = 1 - // scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 - // For integer operation, the multiplier table is to be scaled by 14. - using (var multiplier = new Buffer(64)) + int p0, p1, p2, p3, p4, p5, p6, p7; + + for (int col = 0; col < 8; col++) { - Span multiplierSpan = multiplier; - for (int i = 0; i < 64; i++) + // Gather block data + p0 = blockData[col]; + p1 = blockData[col + 8]; + p2 = blockData[col + 16]; + p3 = blockData[col + 24]; + p4 = blockData[col + 32]; + p5 = blockData[col + 40]; + p6 = blockData[col + 48]; + p7 = blockData[col + 56]; + + int tmp0 = p0 * multiplierTable[col]; + + // Due to quantization, we will usually find that many of the input + // coefficients are zero, especially the AC terms. We can exploit this + // by short-circuiting the IDCT calculation for any column in which all + // the AC terms are zero. In that case each output is equal to the + // DC coefficient (with scale factor as needed). + // With typical images and quantization tables, half or more of the + // column DCT calculations can be simplified this way. + if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0) { - multiplierSpan[i] = (short)Descale(qt[i] * Aanscales[i], 14 - Pass1Bits); - } + short dcval = (short)tmp0; - int p0, p1, p2, p3, p4, p5, p6, p7; + computationBuffer[col] = dcval; + computationBuffer[col + 8] = dcval; + computationBuffer[col + 16] = dcval; + computationBuffer[col + 24] = dcval; + computationBuffer[col + 32] = dcval; + computationBuffer[col + 40] = dcval; + computationBuffer[col + 48] = dcval; + computationBuffer[col + 56] = dcval; - for (int col = 0; col < 8; col++) - { - // Gather block data - p0 = blockData[col]; - p1 = blockData[col + 8]; - p2 = blockData[col + 16]; - p3 = blockData[col + 24]; - p4 = blockData[col + 32]; - p5 = blockData[col + 40]; - p6 = blockData[col + 48]; - p7 = blockData[col + 56]; - - int tmp0 = p0 * multiplierSpan[col]; - - // Due to quantization, we will usually find that many of the input - // coefficients are zero, especially the AC terms. We can exploit this - // by short-circuiting the IDCT calculation for any column in which all - // the AC terms are zero. In that case each output is equal to the - // DC coefficient (with scale factor as needed). - // With typical images and quantization tables, half or more of the - // column DCT calculations can be simplified this way. - if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0) - { - short dcval = (short)tmp0; - - computationBufferSpan[col] = dcval; - computationBufferSpan[col + 8] = dcval; - computationBufferSpan[col + 16] = dcval; - computationBufferSpan[col + 24] = dcval; - computationBufferSpan[col + 32] = dcval; - computationBufferSpan[col + 40] = dcval; - computationBufferSpan[col + 48] = dcval; - computationBufferSpan[col + 56] = dcval; - - continue; - } - - // Even part - int tmp1 = p2 * multiplierSpan[col + 16]; - int tmp2 = p4 * multiplierSpan[col + 32]; - int tmp3 = p6 * multiplierSpan[col + 48]; - - int tmp10 = tmp0 + tmp2; // Phase 3 - int tmp11 = tmp0 - tmp2; - - int tmp13 = tmp1 + tmp3; // Phases 5-3 - int tmp12 = Multiply(tmp1 - tmp3, FIX_1_414213562) - tmp13; // 2*c4 - - tmp0 = tmp10 + tmp13; // Phase 2 - tmp3 = tmp10 - tmp13; - tmp1 = tmp11 + tmp12; - tmp2 = tmp11 - tmp12; - - // Odd Part - int tmp4 = p1 * multiplierSpan[col + 8]; - int tmp5 = p3 * multiplierSpan[col + 24]; - int tmp6 = p5 * multiplierSpan[col + 40]; - int tmp7 = p7 * multiplierSpan[col + 56]; - - int z13 = tmp6 + tmp5; // Phase 6 - int z10 = tmp6 - tmp5; - int z11 = tmp4 + tmp7; - int z12 = tmp4 - tmp7; - - tmp7 = z11 + z13; // Phase 5 - tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4 - - int z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2 - tmp10 = z5 - Multiply(z12, FIX_1_082392200); // 2*(c2-c6) - tmp12 = z5 - Multiply(z10, FIX_2_613125930); // 2*(c2+c6) - - tmp6 = tmp12 - tmp7; // Phase 2 - tmp5 = tmp11 - tmp6; - tmp4 = tmp10 - tmp5; - - computationBufferSpan[col] = (short)(tmp0 + tmp7); - computationBufferSpan[col + 56] = (short)(tmp0 - tmp7); - computationBufferSpan[col + 8] = (short)(tmp1 + tmp6); - computationBufferSpan[col + 48] = (short)(tmp1 - tmp6); - computationBufferSpan[col + 16] = (short)(tmp2 + tmp5); - computationBufferSpan[col + 40] = (short)(tmp2 - tmp5); - computationBufferSpan[col + 24] = (short)(tmp3 + tmp4); - computationBufferSpan[col + 32] = (short)(tmp3 - tmp4); + continue; } - // Pass 2: process rows from work array, store into output array. - // Note that we must descale the results by a factor of 8 == 2**3, - // and also undo the pass 1 bits scaling. - for (int row = 0; row < 64; row += 8) + // Even part + int tmp1 = p2 * multiplierTable[col + 16]; + int tmp2 = p4 * multiplierTable[col + 32]; + int tmp3 = p6 * multiplierTable[col + 48]; + + int tmp10 = tmp0 + tmp2; // Phase 3 + int tmp11 = tmp0 - tmp2; + + int tmp13 = tmp1 + tmp3; // Phases 5-3 + int tmp12 = Multiply(tmp1 - tmp3, FIX_1_414213562) - tmp13; // 2*c4 + + tmp0 = tmp10 + tmp13; // Phase 2 + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + // Odd Part + int tmp4 = p1 * multiplierTable[col + 8]; + int tmp5 = p3 * multiplierTable[col + 24]; + int tmp6 = p5 * multiplierTable[col + 40]; + int tmp7 = p7 * multiplierTable[col + 56]; + + int z13 = tmp6 + tmp5; // Phase 6 + int z10 = tmp6 - tmp5; + int z11 = tmp4 + tmp7; + int z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; // Phase 5 + tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4 + + int z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2 + tmp10 = z5 - Multiply(z12, FIX_1_082392200); // 2*(c2-c6) + tmp12 = z5 - Multiply(z10, FIX_2_613125930); // 2*(c2+c6) + + tmp6 = tmp12 - tmp7; // Phase 2 + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 - tmp5; + + computationBuffer[col] = (short)(tmp0 + tmp7); + computationBuffer[col + 56] = (short)(tmp0 - tmp7); + computationBuffer[col + 8] = (short)(tmp1 + tmp6); + computationBuffer[col + 48] = (short)(tmp1 - tmp6); + computationBuffer[col + 16] = (short)(tmp2 + tmp5); + computationBuffer[col + 40] = (short)(tmp2 - tmp5); + computationBuffer[col + 24] = (short)(tmp3 + tmp4); + computationBuffer[col + 32] = (short)(tmp3 - tmp4); + } + + // Pass 2: process rows from work array, store into output array. + // Note that we must descale the results by a factor of 8 == 2**3, + // and also undo the pass 1 bits scaling. + for (int row = 0; row < 64; row += 8) + { + p1 = computationBuffer[row + 1]; + p2 = computationBuffer[row + 2]; + p3 = computationBuffer[row + 3]; + p4 = computationBuffer[row + 4]; + p5 = computationBuffer[row + 5]; + p6 = computationBuffer[row + 6]; + p7 = computationBuffer[row + 7]; + + // Add range center and fudge factor for final descale and range-limit. + int z5 = computationBuffer[row] + (RangeCenter << (Pass1Bits + 3)) + (1 << (Pass1Bits + 2)); + + // Check for all-zero AC coefficients + if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0) { - p1 = computationBufferSpan[row + 1]; - p2 = computationBufferSpan[row + 2]; - p3 = computationBufferSpan[row + 3]; - p4 = computationBufferSpan[row + 4]; - p5 = computationBufferSpan[row + 5]; - p6 = computationBufferSpan[row + 6]; - p7 = computationBufferSpan[row + 7]; - - // Add range center and fudge factor for final descale and range-limit. - int z5 = computationBufferSpan[row] + (RangeCenter << (Pass1Bits + 3)) + (1 << (Pass1Bits + 2)); - - // Check for all-zero AC coefficients - if ((p1 | p2 | p3 | p4 | p5 | p6 | p7) == 0) - { - byte dcval = Limit[LimitOffset + (RightShift(z5, Pass1Bits + 3) & RangeMask)]; - - blockData[row] = dcval; - blockData[row + 1] = dcval; - blockData[row + 2] = dcval; - blockData[row + 3] = dcval; - blockData[row + 4] = dcval; - blockData[row + 5] = dcval; - blockData[row + 6] = dcval; - blockData[row + 7] = dcval; - - continue; - } - - // Even part - int tmp10 = z5 + p4; - int tmp11 = z5 - p4; - - int tmp13 = p2 + p6; - int tmp12 = Multiply(p2 - p6, FIX_1_414213562) - tmp13; // 2*c4 - - int tmp0 = tmp10 + tmp13; - int tmp3 = tmp10 - tmp13; - int tmp1 = tmp11 + tmp12; - int tmp2 = tmp11 - tmp12; - - // Odd part - int z13 = p5 + p3; - int z10 = p5 - p3; - int z11 = p1 + p7; - int z12 = p1 - p7; - - int tmp7 = z11 + z13; // Phase 5 - tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4 - - z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2 - tmp10 = z5 - Multiply(z12, FIX_1_082392200); // 2*(c2-c6) - tmp12 = z5 - Multiply(z10, FIX_2_613125930); // 2*(c2+c6) - - int tmp6 = tmp12 - tmp7; // Phase 2 - int tmp5 = tmp11 - tmp6; - int tmp4 = tmp10 - tmp5; - - // Final output stage: scale down by a factor of 8, offset, and range-limit - blockData[row] = Limit[LimitOffset + (RightShift(tmp0 + tmp7, Pass1Bits + 3) & RangeMask)]; - blockData[row + 7] = Limit[LimitOffset + (RightShift(tmp0 - tmp7, Pass1Bits + 3) & RangeMask)]; - blockData[row + 1] = Limit[LimitOffset + (RightShift(tmp1 + tmp6, Pass1Bits + 3) & RangeMask)]; - blockData[row + 6] = Limit[LimitOffset + (RightShift(tmp1 - tmp6, Pass1Bits + 3) & RangeMask)]; - blockData[row + 2] = Limit[LimitOffset + (RightShift(tmp2 + tmp5, Pass1Bits + 3) & RangeMask)]; - blockData[row + 5] = Limit[LimitOffset + (RightShift(tmp2 - tmp5, Pass1Bits + 3) & RangeMask)]; - blockData[row + 3] = Limit[LimitOffset + (RightShift(tmp3 + tmp4, Pass1Bits + 3) & RangeMask)]; - blockData[row + 4] = Limit[LimitOffset + (RightShift(tmp3 - tmp4, Pass1Bits + 3) & RangeMask)]; + byte dcval = Limit[LimitOffset + (RightShift(z5, Pass1Bits + 3) & RangeMask)]; + + blockData[row] = dcval; + blockData[row + 1] = dcval; + blockData[row + 2] = dcval; + blockData[row + 3] = dcval; + blockData[row + 4] = dcval; + blockData[row + 5] = dcval; + blockData[row + 6] = dcval; + blockData[row + 7] = dcval; + + continue; } + + // Even part + int tmp10 = z5 + p4; + int tmp11 = z5 - p4; + + int tmp13 = p2 + p6; + int tmp12 = Multiply(p2 - p6, FIX_1_414213562) - tmp13; // 2*c4 + + int tmp0 = tmp10 + tmp13; + int tmp3 = tmp10 - tmp13; + int tmp1 = tmp11 + tmp12; + int tmp2 = tmp11 - tmp12; + + // Odd part + int z13 = p5 + p3; + int z10 = p5 - p3; + int z11 = p1 + p7; + int z12 = p1 - p7; + + int tmp7 = z11 + z13; // Phase 5 + tmp11 = Multiply(z11 - z13, FIX_1_414213562); // 2*c4 + + z5 = Multiply(z10 + z12, FIX_1_847759065); // 2*c2 + tmp10 = z5 - Multiply(z12, FIX_1_082392200); // 2*(c2-c6) + tmp12 = z5 - Multiply(z10, FIX_2_613125930); // 2*(c2+c6) + + int tmp6 = tmp12 - tmp7; // Phase 2 + int tmp5 = tmp11 - tmp6; + int tmp4 = tmp10 - tmp5; + + // Final output stage: scale down by a factor of 8, offset, and range-limit + blockData[row] = Limit[LimitOffset + (RightShift(tmp0 + tmp7, Pass1Bits + 3) & RangeMask)]; + blockData[row + 7] = Limit[LimitOffset + (RightShift(tmp0 - tmp7, Pass1Bits + 3) & RangeMask)]; + blockData[row + 1] = Limit[LimitOffset + (RightShift(tmp1 + tmp6, Pass1Bits + 3) & RangeMask)]; + blockData[row + 6] = Limit[LimitOffset + (RightShift(tmp1 - tmp6, Pass1Bits + 3) & RangeMask)]; + blockData[row + 2] = Limit[LimitOffset + (RightShift(tmp2 + tmp5, Pass1Bits + 3) & RangeMask)]; + blockData[row + 5] = Limit[LimitOffset + (RightShift(tmp2 - tmp5, Pass1Bits + 3) & RangeMask)]; + blockData[row + 3] = Limit[LimitOffset + (RightShift(tmp3 + tmp4, Pass1Bits + 3) & RangeMask)]; + blockData[row + 4] = Limit[LimitOffset + (RightShift(tmp3 - tmp4, Pass1Bits + 3) & RangeMask)]; } } + /// + /// Descale and correctly round an int value that's scaled by bits. + /// We assume rounds towards minus infinity, so adding + /// the fudge factor is correct for either sign of . + /// + /// The value + /// The number of bits + /// The + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int Descale(int value, int n) + { + return RightShift(value + (1 << (n - 1)), n); + } + /// /// Multiply a variable by an int constant, and immediately descale. /// @@ -514,19 +507,5 @@ { return value >> shift; } - - /// - /// Descale and correctly round an int value that's scaled by bits. - /// We assume rounds towards minus infinity, so adding - /// the fudge factor is correct for either sign of . - /// - /// The value - /// The number of bits - /// The - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int Descale(int value, int n) - { - return RightShift(value + (1 << (n - 1)), n); - } } } \ No newline at end of file diff --git a/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs index ef49dfaf06..074ee3cfdc 100644 --- a/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/Port/JpegDecoderCore.cs @@ -792,13 +792,28 @@ namespace ImageSharp.Formats.Jpeg.Port int blocksPerLine = component.BlocksPerLine; int blocksPerColumn = component.BlocksPerColumn; using (var computationBuffer = Buffer.CreateClean(64)) - { + using (var multiplicationBuffer = Buffer.CreateClean(64)) + { + Span quantizationTable = this.quantizationTables.Tables.GetRowSpan(frameComponent.QuantizationIdentifier); + Span computationBufferSpan = computationBuffer; + + // For AA&N IDCT method, multiplier are equal to quantization + // coefficients scaled by scalefactor[row]*scalefactor[col], where + // scalefactor[0] = 1 + // scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + // For integer operation, the multiplier table is to be scaled by 12. + Span multiplierSpan = multiplicationBuffer; + for (int i = 0; i < 64; i++) + { + multiplierSpan[i] = (short)IDCT.Descale(quantizationTable[i] * IDCT.Aanscales[i], 12); + } + for (int blockRow = 0; blockRow < blocksPerColumn; blockRow++) { for (int blockCol = 0; blockCol < blocksPerLine; blockCol++) { int offset = GetBlockBufferOffset(ref component, blockRow, blockCol); - IDCT.QuantizeAndInverseAlt(this.quantizationTables, ref frameComponent, offset, computationBuffer); + IDCT.QuantizeAndInverseFast(ref frameComponent, offset, ref computationBufferSpan, ref multiplierSpan); } } } @@ -808,7 +823,6 @@ namespace ImageSharp.Formats.Jpeg.Port /// /// Builds the huffman tables - /// TODO: This is our bottleneck. We should use a faster algorithm with a LUT. /// /// The tables /// The table index