From cc99da35bf20804ae57000e15bb75b4c330a8679 Mon Sep 17 00:00:00 2001
From: Dmitry Pentin <riotbr3aker@gmail.com>
Date: Sun, 29 Aug 2021 05:35:58 +0300
Subject: [PATCH] Added DCT in place

---
 .../Decoder/JpegBlockPostProcessor.cs         | 24 ++++------
 .../Components/Encoder/HuffmanScanEncoder.cs  | 22 +++++-----
 .../Jpeg/Components/FastFloatingPointDCT.cs   | 44 +++++++++++++++----
 .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs  |  2 +-
 4 files changed, 57 insertions(+), 35 deletions(-)
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
index 00169d082..cf5fdd2df 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@@ -19,14 +19,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
         public Block8x8F SourceBlock;
 
         /// <summary>
-        /// Temporal block 1 to store intermediate and/or final computation results.
+        /// Temporary block to store intermediate computation results.
         /// </summary>
-        public Block8x8F WorkspaceBlock1;
-
-        /// <summary>
-        /// Temporal block 2 to store intermediate and/or final computation results.
-        /// </summary>
-        public Block8x8F WorkspaceBlock2;
+        public Block8x8F WorkspaceBlock;
 
         /// <summary>
         /// The quantization table as <see cref="Block8x8F"/>.
@@ -50,8 +45,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
             this.subSamplingDivisors = component.SubSamplingDivisors;
 
             this.SourceBlock = default;
-            this.WorkspaceBlock1 = default;
-            this.WorkspaceBlock2 = default;
+            this.WorkspaceBlock = default;
         }
 
         /// <summary>
@@ -71,20 +65,20 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
             int destAreaStride,
             float maximumValue)
         {
-            ref Block8x8F b = ref this.SourceBlock;
-            b.LoadFrom(ref sourceBlock);
+            ref Block8x8F block = ref this.SourceBlock;
+            block.LoadFrom(ref sourceBlock);
 
             // Dequantize:
-            b.MultiplyInPlace(ref this.DequantiazationTable);
+            block.MultiplyInPlace(ref this.DequantiazationTable);
 
-            FastFloatingPointDCT.TransformIDCT(ref b, ref this.WorkspaceBlock1, ref this.WorkspaceBlock2);
+            FastFloatingPointDCT.TransformInplaceIDCT(ref block, ref this.WorkspaceBlock);
 
             // To conform better to libjpeg we actually NEED TO loose precision here.
             // This is because they store blocks as Int16 between all the operations.
             // To be "more accurate", we need to emulate this by rounding!
-            this.WorkspaceBlock1.NormalizeColorsAndRoundInPlace(maximumValue);
+            block.NormalizeColorsAndRoundInPlace(maximumValue);
 
-            this.WorkspaceBlock1.ScaledCopyTo(
+            block.ScaledCopyTo(
                 ref destAreaOrigin,
                 destAreaStride,
                 this.subSamplingDivisors.Width,
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 8b61b66c9..4f5ffb3f8 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -94,8 +94,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         private int bitCount;
 
-        private Block8x8F temporalBlock1;
-        private Block8x8F temporalBlock2;
+        private Block8x8F temporalBlock;
         private Block8x8 temporalShortBlock;
 
         /// <summary>
@@ -299,23 +298,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
         /// </summary>
         /// <param name="index">The quantization table index.</param>
         /// <param name="prevDC">The previous DC value.</param>
-        /// <param name="src">Source block</param>
-        /// <param name="quant">Quantization table</param>
-        /// <param name="unZig">The 8x8 Unzig block.</param>
+        /// <param name="block">Source block; level-shifted and transformed in place.</param>
+        /// <param name="quant">Quantization table.</param>
         /// <returns>The <see cref="int"/>.</returns>
         private int WriteBlock(
             QuantIndex index,
             int prevDC,
-            ref Block8x8F src,
+            ref Block8x8F block,
             ref Block8x8F quant)
         {
-            ref Block8x8F refTemp1 = ref this.temporalBlock1;
-            ref Block8x8F refTemp2 = ref this.temporalBlock2;
+            ref Block8x8F refTemp = ref this.temporalBlock;
             ref Block8x8 spectralBlock = ref this.temporalShortBlock;
 
-            FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2);
+            // Shifting level from 0..255 to -128..127
+            block.AddInPlace(-128f);
 
-            Block8x8F.Quantize(ref refTemp1, ref spectralBlock, ref quant);
+            // Discrete cosine transform
+            FastFloatingPointDCT.TransformInplaceFDCT(ref block, ref refTemp);
+
+            // Quantization
+            Block8x8F.Quantize(ref block, ref spectralBlock, ref quant);
 
             // Emit the DC delta.
             int dc = spectralBlock[0];
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 0f569b5da..dd46a83e3 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -276,28 +276,36 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
         /// <param name="src">Source</param>
         /// <param name="dest">Destination</param>
         /// <param name="temp">Temporary block provided by the caller for optimization</param>
-        /// <param name="offsetSourceByNeg128">If true, a constant -128.0 offset is applied for all values before FDCT </param>
         public static void TransformFDCT(
             ref Block8x8F src,
             ref Block8x8F dest,
-            ref Block8x8F temp,
-            bool offsetSourceByNeg128 = true)
+            ref Block8x8F temp)
         {
             src.TransposeInto(ref temp);
-            if (offsetSourceByNeg128)
-            {
-                temp.AddInPlace(-128F);
-            }
-
             FDCT8x8(ref temp, ref dest);
 
             dest.TransposeInto(ref temp);
-
             FDCT8x8(ref temp, ref dest);
 
             dest.MultiplyInPlace(C_0_125);
         }
 
+        /// <summary>
+        /// Applies the floating point FDCT in place.
+        /// </summary>
+        /// <param name="matrix">Block to transform; overwritten with the result.</param>
+        /// <param name="temp">Block used to store temporary results.</param>
+        public static void TransformInplaceFDCT(ref Block8x8F matrix, ref Block8x8F temp)
+        {
+            matrix.TransposeInto(ref temp);
+            FDCT8x8(ref temp, ref matrix);
+
+            matrix.TransposeInto(ref temp);
+            FDCT8x8(ref temp, ref matrix);
+
+            matrix.MultiplyInPlace(C_0_125);
+        }
+
         /// <summary>
         /// Performs 8x8 matrix Inverse Discrete Cosine Transform
         /// </summary>
@@ -510,5 +518,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
             // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
             dest.MultiplyInPlace(C_0_125);
         }
+
+        /// <summary>
+        /// Applies the floating point IDCT in place.
+        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
+        /// </summary>
+        /// <param name="block">Block to transform; overwritten with the result.</param>
+        /// <param name="temp">Block used to store temporary results.</param>
+        public static void TransformInplaceIDCT(ref Block8x8F block, ref Block8x8F temp)
+        {
+            block.TransposeInto(ref temp);
+
+            IDCT8x8(ref temp, ref block);
+            block.TransposeInto(ref temp);
+            IDCT8x8(ref temp, ref block);
+
+            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
+            block.MultiplyInPlace(C_0_125);
+        }
     }
 }
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index d49a6498c..34ca7f9eb 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -310,7 +310,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
                     ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
 
                     // testee
-                    FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false);
+                    FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2);
 
                     var actualDest = new float[64];
                     destBlock.ScaledCopyTo(actualDest);