From e83cb95cb377df22d51ec0f95cb4ebf1ce122d27 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 17 Aug 2021 12:24:37 +0300 Subject: [PATCH 01/56] Moved stuff bytes injection to outer method --- .../Components/Encoder/HuffmanScanEncoder.cs | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 331da275c..778d6ccd8 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -35,6 +35,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes]; + private readonly byte[] streamWriteBuffer = new byte[EmitBufferSizeInBytes * 2]; + + private const int BytesPerCodingUnit = 256 * 3; + /// /// Number of filled bytes in buffer /// @@ -116,6 +120,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref pixelConverter.Cr, ref chrominanceQuantTable, ref unzig); + + if (this.emitLen + BytesPerCodingUnit > EmitBufferSizeInBytes) + { + this.WriteToStream(); + } } } @@ -326,28 +335,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder byte b = (byte)(bits >> 24); this.emitBuffer[this.emitLen++] = b; - // Adding stuff byte - // This is because by JPEG standard scan data can contain JPEG markers (indicated by the 0xFF byte, followed by a non-zero byte) - // Considering this every 0xFF byte must be followed by 0x00 padding byte to signal that this is not a marker - if (b == byte.MaxValue) - { - this.emitBuffer[this.emitLen++] = byte.MinValue; - } - bits <<= 8; count -= 8; } - - // This can emit 4 times of: - // 1 byte guaranteed - // 1 extra byte.MinValue byte if previous one was byte.MaxValue - // Thus writing (1 + 1) * 4 = 8 bytes max - // So we must check if emit buffer has extra 8 bytes, if 
not - call stream.Write - if (this.emitLen > EmitBufferSizeInBytes - 8) - { - this.target.Write(this.emitBuffer, 0, this.emitLen); - this.emitLen = 0; - } } this.accumulatedBits = bits; @@ -520,5 +510,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder return index; } } + + [MethodImpl(InliningOptions.ShortMethod)] + private void WriteToStream() + { + int writeIdx = 0; + for (int i = 0; i < this.emitLen; i++) + { + byte value = this.emitBuffer[i]; + this.streamWriteBuffer[writeIdx++] = value; + if (value == 0xff) + { + this.streamWriteBuffer[writeIdx++] = 0x00; + } + } + + this.target.Write(this.streamWriteBuffer, 0, writeIdx); + this.emitLen = 0; + } } } From 739f5206404715ada29c62f5881cbdfb044f1232 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 17 Aug 2021 13:27:37 +0300 Subject: [PATCH 02/56] Optimized byte emition, ouput images are corrupted due to msb-lsb invalid order --- .../Components/Encoder/HuffmanScanEncoder.cs | 55 +++++++++---------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 778d6ccd8..10eda9c5a 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -1,8 +1,10 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System; using System.IO; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -33,7 +35,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// A buffer for reducing the number of stream writes when emitting Huffman tables. 
/// - private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes]; + private readonly uint[] emitBuffer = new uint[EmitBufferSizeInBytes / 4]; private readonly byte[] streamWriteBuffer = new byte[EmitBufferSizeInBytes * 2]; @@ -47,7 +49,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// Emmited bits 'micro buffer' before being transfered to the . /// - private int accumulatedBits; + private uint accumulatedBits; /// /// Number of jagged bits stored in @@ -121,7 +123,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref chrominanceQuantTable, ref unzig); - if (this.emitLen + BytesPerCodingUnit > EmitBufferSizeInBytes) + if (this.emitLen + (BytesPerCodingUnit / 4) > EmitBufferSizeInBytes / 4) { this.WriteToStream(); } @@ -320,27 +322,22 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// The packed bits. /// The number of bits [MethodImpl(InliningOptions.ShortMethod)] - private void Emit(int bits, int count) + private void Emit(uint bits, int count) { + uint correctedBits = bits << (32 - count); + + this.accumulatedBits |= correctedBits >> this.bitCount; + count += this.bitCount; - bits <<= 32 - count; - bits |= this.accumulatedBits; - // Only write if more than 8 bits. 
- if (count >= 8) + if (count >= 32) { - // Track length - while (count >= 8) - { - byte b = (byte)(bits >> 24); - this.emitBuffer[this.emitLen++] = b; + this.emitBuffer[this.emitLen++] = this.accumulatedBits; + this.accumulatedBits = correctedBits << (32 - this.bitCount); - bits <<= 8; - count -= 8; - } + count -= 32; } - this.accumulatedBits = bits; this.bitCount = count; } @@ -353,7 +350,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder private void EmitHuff(int[] table, int value) { int x = table[value]; - this.Emit(x >> 8, x & 0xff); + this.Emit((uint)x >> 8, x & 0xff); } [MethodImpl(InliningOptions.ShortMethod)] @@ -372,7 +369,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder this.EmitHuff(table, bt); if (bt > 0) { - this.Emit(b & ((1 << bt) - 1), bt); + this.Emit((uint)(b & ((1 << bt) - 1)), bt); } } @@ -396,7 +393,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder int bt = GetHuffmanEncodingLength((uint)a); this.EmitHuff(table, (runLength << 4) | bt); - this.Emit(b & ((1 << bt) - 1), bt); + this.Emit((uint)(b & ((1 << bt) - 1)), bt); } /// @@ -406,12 +403,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder private void FlushInternalBuffer() { // pad last byte with 1's - int padBitsCount = 8 - (this.bitCount % 8); - if (padBitsCount != 0) - { - this.Emit((1 << padBitsCount) - 1, padBitsCount); - this.target.Write(this.emitBuffer, 0, this.emitLen); - } + //int padBitsCount = 8 - (this.bitCount % 8); + //if (padBitsCount != 0) + //{ + // this.Emit((1 << padBitsCount) - 1, padBitsCount); + // this.target.Write(this.emitBuffer, 0, this.emitLen); + //} } /// @@ -514,10 +511,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder [MethodImpl(InliningOptions.ShortMethod)] private void WriteToStream() { + Span emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan()); + int writeIdx = 0; - for (int i = 0; i < this.emitLen; i++) + for (int i = 0; i < this.emitLen * 4; i++) { - byte 
value = this.emitBuffer[i]; + byte value = emitBytes[i]; this.streamWriteBuffer[writeIdx++] = value; if (value == 0xff) { From 8a08259e09bfc92fb4b925834807cd2b712f730b Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Wed, 18 Aug 2021 11:47:52 +0300 Subject: [PATCH 03/56] Fixed byte flush order, fixed last byte padding --- .../Components/Encoder/HuffmanScanEncoder.cs | 45 +++++++++++++++---- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 10eda9c5a..42a683539 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -41,10 +41,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder private const int BytesPerCodingUnit = 256 * 3; - /// - /// Number of filled bytes in buffer - /// - private int emitLen = 0; + private int emitWriteIndex = (EmitBufferSizeInBytes / 4); /// /// Emmited bits 'micro buffer' before being transfered to the . 
@@ -123,14 +120,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref chrominanceQuantTable, ref unzig); - if (this.emitLen + (BytesPerCodingUnit / 4) > EmitBufferSizeInBytes / 4) + if (this.emitWriteIndex < this.emitBuffer.Length / 2) { this.WriteToStream(); } } } - this.FlushInternalBuffer(); + this.EmitFinalBits(); } /// @@ -311,6 +308,34 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder return dc; } + [MethodImpl(InliningOptions.ShortMethod)] + private void EmitFinalBits() + { + // Bytes count we want to write to the output stream + int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8); + + // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits + uint packedBytes = (this.accumulatedBits | (uint.MaxValue >> this.bitCount)) >> ((4 - valuableBytesCount) * 8); + + // 2x size due to possible stuff bytes, max out to 8 + Span tempBuffer = stackalloc byte[valuableBytesCount * 2]; + + // Write bytes to temporal buffer + int writeCount = 0; + for (int i = 0; i < valuableBytesCount; i++) + { + byte value = (byte)(packedBytes >> (i * 8)); + tempBuffer[writeCount++] = value; + if (value == 0xff) + { + tempBuffer[writeCount++] = 0; + } + } + + // Write temporal buffer to the output stream + this.target.Write(tempBuffer, 0, writeCount); + } + /// /// Emits the least significant count of bits to the stream write buffer. 
/// The precondition is bits @@ -332,7 +357,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder if (count >= 32) { - this.emitBuffer[this.emitLen++] = this.accumulatedBits; + this.emitBuffer[--this.emitWriteIndex] = this.accumulatedBits; this.accumulatedBits = correctedBits << (32 - this.bitCount); count -= 32; @@ -514,7 +539,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder Span emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan()); int writeIdx = 0; - for (int i = 0; i < this.emitLen * 4; i++) + int start = emitBytes.Length - 1; + int end = (this.emitWriteIndex * 4) - 1; + for (int i = start; i > end; i--) { byte value = emitBytes[i]; this.streamWriteBuffer[writeIdx++] = value; @@ -525,7 +552,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } this.target.Write(this.streamWriteBuffer, 0, writeIdx); - this.emitLen = 0; + this.emitWriteIndex = this.emitBuffer.Length; } } } From 4c14c57d09aa9d115cf881cafef5f70ba99c035c Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Wed, 18 Aug 2021 15:11:15 +0300 Subject: [PATCH 04/56] Greatly reduced operations per emit call --- .../Jpeg/Components/Encoder/HuffmanLut.cs | 3 ++- .../Components/Encoder/HuffmanScanEncoder.cs | 23 ++++++++----------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs index ec77bf87d..f563e74e0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs @@ -4,6 +4,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { /// + /// TODO: THIS IS NO LONGER TRUE, INTERNAL REPRESENTATION WAS CHANGED AND THIS DOC SHOULD BE CHANGED TOO!!! /// A compiled look-up table representation of a huffmanSpec. 
/// Each value maps to a int32 of which the 24 most significant bits hold the /// codeword in bits and the 8 least significant bits hold the codeword size. @@ -54,7 +55,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder int len = i + 1; for (int j = 0; j < spec.Count[i]; j++) { - this.Values[spec.Values[k]] = len | (code << 8); + this.Values[spec.Values[k]] = len | (code << (32 - len)); code++; k++; } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 42a683539..fba814882 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -349,16 +349,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder [MethodImpl(InliningOptions.ShortMethod)] private void Emit(uint bits, int count) { - uint correctedBits = bits << (32 - count); - - this.accumulatedBits |= correctedBits >> this.bitCount; + this.accumulatedBits |= bits >> this.bitCount; count += this.bitCount; if (count >= 32) { this.emitBuffer[--this.emitWriteIndex] = this.accumulatedBits; - this.accumulatedBits = correctedBits << (32 - this.bitCount); + this.accumulatedBits = bits << (32 - this.bitCount); count -= 32; } @@ -375,7 +373,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder private void EmitHuff(int[] table, int value) { int x = table[value]; - this.Emit((uint)x >> 8, x & 0xff); + this.Emit((uint)x & 0xffff_ff00u, x & 0xff); } [MethodImpl(InliningOptions.ShortMethod)] @@ -389,13 +387,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder b = value - 1; } - int bt = GetHuffmanEncodingLength((uint)a); + int valueLen = GetHuffmanEncodingLength((uint)a); - this.EmitHuff(table, bt); - if (bt > 0) - { - this.Emit((uint)(b & ((1 << bt) - 1)), bt); - } + this.EmitHuff(table, valueLen); + this.Emit((uint)b << (32 - valueLen), valueLen); } /// @@ 
-415,10 +410,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder b = value - 1; } - int bt = GetHuffmanEncodingLength((uint)a); + int valueLen = GetHuffmanEncodingLength((uint)a); - this.EmitHuff(table, (runLength << 4) | bt); - this.Emit((uint)(b & ((1 << bt) - 1)), bt); + this.EmitHuff(table, (runLength << 4) | valueLen); + this.Emit((uint)b << (32 - valueLen), valueLen); } /// From c39a20326b991ed767204c61f88c15915fe24a27 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 20 Aug 2021 12:47:33 +0300 Subject: [PATCH 05/56] Merged huffman prefix & value Emit() calls --- .../Components/Encoder/HuffmanScanEncoder.cs | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index fba814882..8289a4b3c 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -269,7 +269,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // Emit the DC delta. int dc = (int)refTemp2[0]; - this.EmitDirectCurrentTerm(this.huffmanTables[2 * (int)index].Values, dc - prevDC); + this.EmitHuffRLE(this.huffmanTables[2 * (int)index].Values, 0, dc - prevDC); // Emit the AC components. 
int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values; @@ -376,23 +376,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder this.Emit((uint)x & 0xffff_ff00u, x & 0xff); } - [MethodImpl(InliningOptions.ShortMethod)] - private void EmitDirectCurrentTerm(int[] table, int value) - { - int a = value; - int b = value; - if (a < 0) - { - a = -value; - b = value - 1; - } - - int valueLen = GetHuffmanEncodingLength((uint)a); - - this.EmitHuff(table, valueLen); - this.Emit((uint)b << (32 - valueLen), valueLen); - } - /// /// Emits a run of runLength copies of value encoded with the given Huffman encoder. /// @@ -412,8 +395,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder int valueLen = GetHuffmanEncodingLength((uint)a); - this.EmitHuff(table, (runLength << 4) | valueLen); - this.Emit((uint)b << (32 - valueLen), valueLen); + // Huffman prefix code + int huffPackage = table[(runLength << 4) | valueLen]; + int prefixLen = huffPackage & 0xff; + uint prefix = (uint)huffPackage & 0xffff_0000u; + + // Actual encoded value + uint encodedValue = (uint)b << (32 - valueLen); + + // Doing two binary shifts to get rid of leading 1's in negative value case + this.Emit(prefix | (encodedValue >> prefixLen), prefixLen + valueLen); } /// From 93044e4de00e5cad9fa776692223634742064411 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 23 Aug 2021 10:59:59 +0300 Subject: [PATCH 06/56] Sandbox code & results --- .../Program.cs | 93 +++++++++++++++++-- 1 file changed, 85 insertions(+), 8 deletions(-) diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs index e6e82b981..d4656f8be 100644 --- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs +++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs @@ -2,6 +2,9 @@ // Licensed under the Apache License, Version 2.0. 
using System; +using System.Diagnostics; +using System.IO; +using SixLabors.ImageSharp.Formats.Jpeg; using SixLabors.ImageSharp.Tests.Formats.Jpg; using SixLabors.ImageSharp.Tests.PixelFormats.PixelOperations; using SixLabors.ImageSharp.Tests.ProfilingBenchmarks; @@ -31,14 +34,88 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox /// public static void Main(string[] args) { - LoadResizeSaveParallelMemoryStress.Run(); - // RunJpegEncoderProfilingTests(); - // RunJpegColorProfilingTests(); - // RunDecodeJpegProfilingTests(); - // RunToVector4ProfilingTest(); - // RunResizeProfilingTest(); - - // Console.ReadLine(); + /* Master */ + // Elapsed: 5431ms across 200 iterations + // Average: 27,155ms + + /* Inserting stuff bytes later */ + // Elapsed: 5300ms across 200 iterations + // Average: 26,5ms + + /* Flush if check */ + // Elapsed: 5209ms across 200 iterations + // Average: 26,045ms + + /* [INVALID] int32 flush - invalid flush order */ + // Elapsed: 4784ms across 200 iterations + // Average: 23,92ms + + /* int32 flush - correct flush order */ + // Elapsed: 5049ms across 200 iterations + // Average: 25,245ms + + /* int32 flush - identical file output */ + // Elapsed: 4800ms across 200 iterations + // Average: 24.00ms + + /* int32 flush - optimized huffman storage & reduced instructions per Emit() */ + // Elapsed: 4680ms across 200 iterations + // Average: 23,4ms + + /* int32 flush - merged prefix & value Emit() call */ + // Elapsed: 4644ms across 200 iterations + // Average: 23,22ms + + BenchmarkEncoder("uniform_size", 200, 100); + + //ReEncodeImage("uniform_size", 100); + + Console.WriteLine("Done."); + } + + const string pathTemplate = "C:\\Users\\pl4nu\\Downloads\\{0}.jpg"; + + private static void BenchmarkEncoder(string fileName, int iterations, int quality) + { + string loadPath = String.Format(pathTemplate, fileName); + + using var saveStream = new MemoryStream(); + + var decoder = new JpegDecoder { IgnoreMetadata = true }; + using Image img = 
decoder.Decode(Configuration.Default, new FileStream(loadPath, FileMode.Open)); + + var encoder = new JpegEncoder() + { + Quality = quality, + ColorType = JpegColorType.YCbCr, + Subsample = JpegSubsample.Ratio444 + }; + + Stopwatch sw = new Stopwatch(); + sw.Start(); + for (int i = 0; i < iterations; i++) + { + img.SaveAsJpeg(saveStream, encoder); + saveStream.Position = 0; + } + sw.Stop(); + + Console.WriteLine($"// Elapsed: {sw.ElapsedMilliseconds}ms across {iterations} iterations\n// Average: {(double)sw.ElapsedMilliseconds / iterations}ms"); + } + + private static void ReEncodeImage(string fileName, int quality) + { + string loadPath = String.Format(pathTemplate, fileName); + using Image img = Image.Load(loadPath); + + string savePath = String.Format(pathTemplate, $"testSave_{fileName}"); + var encoder = new JpegEncoder() + { + Quality = quality, + ColorType = JpegColorType.YCbCr, + Subsample = JpegSubsample.Ratio444 + }; + img.SaveAsJpeg(savePath, encoder); } private static void RunJpegEncoderProfilingTests() From cc45eed3a1eeace81581eaba6a22f878d2bcc08d Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 23 Aug 2021 11:02:41 +0300 Subject: [PATCH 07/56] Fixed last valuable index logic --- .../Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 8289a4b3c..d8ea6bb0e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -482,7 +482,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder for (int i = 7; i >= 0; i--) { - int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32(Unsafe.Add(ref mcuStride, i)), zero8).AsByte()); + int areEqual = 
Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte()); // we do not know for sure if this stride contain all non-zero elements or if it has some trailing zeros if (areEqual != equalityMask) From 937a8689ba3bf5dfdd41061c82c26f2fb652442d Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 23 Aug 2021 11:30:11 +0300 Subject: [PATCH 08/56] Optimized lvi calculation via lzcnt intrinsic --- .../Components/Encoder/HuffmanScanEncoder.cs | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index d8ea6bb0e..373475f6b 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -3,6 +3,7 @@ using System; using System.IO; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS @@ -441,7 +442,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // Lzcnt would return 32 for input value of 0 - no need to check that with branching // Fallback code if Lzcnt is not supported still use if-check // But most modern CPUs support this instruction so this should not be a problem - return 32 - System.Numerics.BitOperations.LeadingZeroCount(value); + return 32 - BitOperations.LeadingZeroCount(value); #else // Ideally: // if 0 - return 0 in this case @@ -458,13 +459,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } /// - /// Returns index of the last non-zero element in given mcu block. - /// If all values of the mcu block are zero, this method might return different results depending on the runtime and hardware support. - /// This is jpeg mcu specific code, mcu[0] stores a dc value which will be encoded outside of the loop. 
- /// This method is guaranteed to return either -1 or 0 if all elements are zero. + /// Returns index of the last non-zero element in given matrix. /// /// - /// This is an internal operation supposed to be used only in class for jpeg encoding. + /// Returns 0 for all-zero matrix by convention. /// /// Mcu block. /// Index of the last non-zero element. @@ -484,24 +482,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte()); - // we do not know for sure if this stride contain all non-zero elements or if it has some trailing zeros if (areEqual != equalityMask) { - // last index in the stride, we go from the end to the start of the stride - int startIndex = i * 8; - int index = startIndex + 7; - ref float elemRef = ref Unsafe.As(ref mcu); - while (index >= startIndex && (int)Unsafe.Add(ref elemRef, index) == 0) - { - index--; - } - - // this implementation will return -1 if all ac components are zero and dc are zero - return index; + // Each 4 bits represents comparison operation for each 4-byte element in input vectors + // LSB represents first element in the stride + // MSB represents last element in the stride + // lzcnt operation would calculate number of zero numbers at the end + + // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements + // So we need to invert it + int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual); + + // As input number is represented by 4 bits in the mask, we need to divide lzcnt result by 4 + // to get the exact number of zero elements in the stride + int strideRelativeIndex = 7 - (lzcnt / 4); + return (i * 8) + strideRelativeIndex; } } - return -1; + return 0; } else #endif @@ -514,7 +513,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder index--; } - // this implementation will return 0 if all ac 
components and dc are zero return index; } } From f9b36e794dfca1079ae517fa58af70e7b1d01e15 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 23 Aug 2021 11:30:47 +0300 Subject: [PATCH 09/56] Sandbox code & results --- tests/ImageSharp.Tests.ProfilingSandbox/Program.cs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs index d4656f8be..bdba1bef6 100644 --- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs +++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs @@ -66,6 +66,15 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox // Elapsed: 4644ms across 200 iterations // Average: 23,22ms + + /* Fixed last valuable index calculation */ + // Elapsed: 4606ms across 200 iterations + // Average: 23,03ms + + /* Intrinsic last valuable index */ + // Elapsed: 4519ms across 200 iterations + // Average: 22,595ms + BenchmarkEncoder("uniform_size", 200, 100); //ReEncodeImage("uniform_size", 100); From 787ffa57eeee862755d039c0ca672f8b1ef86aac Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 23 Aug 2021 17:04:57 +0300 Subject: [PATCH 10/56] Removed unused methods & constructor, fixed warnings --- .../Formats/Jpeg/Components/Block8x8.cs | 77 ++++++------------- .../Formats/Jpg/Block8x8FTests.cs | 4 +- .../Formats/Jpg/Block8x8Tests.cs | 38 +++------ .../Jpg/Utils/LibJpegTools.ComponentData.cs | 2 +- 4 files changed, 34 insertions(+), 87 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index bc6036903..d61a3c6fd 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -28,17 +28,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// private fixed short data[Size]; - /// - /// Initializes a new instance of the struct. 
- /// - /// A of coefficients - public Block8x8(Span coefficients) - { - ref byte selfRef = ref Unsafe.As(ref this); - ref byte sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(coefficients)); - Unsafe.CopyBlock(ref selfRef, ref sourceRef, Size * sizeof(short)); - } - /// /// Gets or sets a value at the given index /// @@ -75,15 +64,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components set => this[(y * 8) + x] = value; } - public static bool operator ==(Block8x8 left, Block8x8 right) - { - return left.Equals(right); - } + public static bool operator ==(Block8x8 left, Block8x8 right) => left.Equals(right); - public static bool operator !=(Block8x8 left, Block8x8 right) - { - return !left.Equals(right); - } + public static bool operator !=(Block8x8 left, Block8x8 right) => !left.Equals(right); /// /// Multiply all elements by a given @@ -149,34 +132,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components return result; } - /// - /// Pointer-based "Indexer" (getter part) - /// - /// Block pointer - /// Index - /// The scaleVec value at the specified index - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static short GetScalarAt(Block8x8* blockPtr, int idx) - { - GuardBlockIndex(idx); - - short* fp = blockPtr->data; - return fp[idx]; - } - - /// - /// Pointer-based "Indexer" (setter part) - /// - /// Block pointer - /// Index - /// Value - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void SetScalarAt(Block8x8* blockPtr, int idx, short value) + public static Block8x8 Load(Span data) { - GuardBlockIndex(idx); - - short* fp = blockPtr->data; - fp[idx] = value; + Block8x8 result = default; + result.LoadFrom(data); + return result; } /// @@ -194,7 +154,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// public short[] ToArray() { - var result = new short[Size]; + short[] result = new short[Size]; this.CopyTo(result); return result; } @@ -220,6 +180,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 
} } + /// + /// Load raw 16bit integers from source. + /// + /// Source + [MethodImpl(InliningOptions.ShortMethod)] + public void LoadFrom(Span source) + { + ref byte s = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref byte d = ref Unsafe.As(ref this); + + Unsafe.CopyBlock(ref d, ref s, Size * sizeof(short)); + } + /// /// Cast and copy -s from the beginning of 'source' span. /// @@ -271,16 +244,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } /// - public override bool Equals(object obj) - { - return obj is Block8x8 other && this.Equals(other); - } + public override bool Equals(object obj) => obj is Block8x8 other && this.Equals(other); /// - public override int GetHashCode() - { - return (this[0] * 31) + this[1]; - } + public override int GetHashCode() => (this[0] * 31) + this[1]; /// /// Calculate the total sum of absolute differences of elements in 'a' and 'b'. diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index c68b0ffa8..42fdd603e 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -462,7 +462,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg short[] data = Create8x8ShortData(); - var source = new Block8x8(data); + var source = Block8x8.Load(data); Block8x8F dest = default; dest.LoadFromInt16Scalar(ref source); @@ -483,7 +483,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg short[] data = Create8x8ShortData(); - var source = new Block8x8(data); + var source = Block8x8.Load(data); Block8x8F dest = default; dest.LoadFromInt16ExtendedAvx2(ref source); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs index 9195f0915..afe71ad04 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs @@ -22,7 +22,7 @@ namespace 
SixLabors.ImageSharp.Tests.Formats.Jpg { short[] data = Create8x8ShortData(); - var block = new Block8x8(data); + var block = Block8x8.Load(data); for (int i = 0; i < Block8x8.Size; i++) { @@ -43,32 +43,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Assert.Equal(42, block[42]); } - [Fact] - public unsafe void Indexer_GetScalarAt_SetScalarAt() - { - int sum; - var block = default(Block8x8); - - for (int i = 0; i < Block8x8.Size; i++) - { - Block8x8.SetScalarAt(&block, i, (short)i); - } - - sum = 0; - for (int i = 0; i < Block8x8.Size; i++) - { - sum += Block8x8.GetScalarAt(&block, i); - } - - Assert.Equal(sum, 64 * 63 / 2); - } - [Fact] public void AsFloatBlock() { short[] data = Create8x8ShortData(); - var source = new Block8x8(data); + var source = Block8x8.Load(data); Block8x8F dest = source.AsFloatBlock(); @@ -82,7 +62,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg public void ToArray() { short[] data = Create8x8ShortData(); - var block = new Block8x8(data); + var block = Block8x8.Load(data); short[] result = block.ToArray(); @@ -93,8 +73,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg public void Equality_WhenTrue() { short[] data = Create8x8ShortData(); - var block1 = new Block8x8(data); - var block2 = new Block8x8(data); + var block1 = Block8x8.Load(data); + var block2 = Block8x8.Load(data); block1[0] = 42; block2[0] = 42; @@ -107,8 +87,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg public void Equality_WhenFalse() { short[] data = Create8x8ShortData(); - var block1 = new Block8x8(data); - var block2 = new Block8x8(data); + var block1 = Block8x8.Load(data); + var block2 = Block8x8.Load(data); block1[0] = 42; block2[0] = 666; @@ -131,8 +111,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg public void TotalDifference() { short[] data = Create8x8ShortData(); - var block1 = new Block8x8(data); - var block2 = new Block8x8(data); + var block1 = Block8x8.Load(data); + var block2 = Block8x8.Load(data); block2[10] += 7; block2[63] += 
8; diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs index edb8d457b..560238edb 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs @@ -53,7 +53,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils { this.MinVal = Math.Min(this.MinVal, data.Min()); this.MaxVal = Math.Max(this.MaxVal, data.Max()); - this.SpectralBlocks[x, y] = new Block8x8(data); + this.SpectralBlocks[x, y] = Block8x8.Load(data); } public void LoadSpectralStride(Buffer2D data, int strideIndex) From a75d6e6e7d28747f361a90e5a06421ee8d22173b Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 24 Aug 2021 14:34:55 +0300 Subject: [PATCH 11/56] Added sse/avx vector fields to the Block8x8, small QOL fixes --- .../Formats/Jpeg/Components/Block8x8.cs | 57 ++++++++++++++----- .../Formats/Jpeg/Components/Block8x8F.cs | 13 +---- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index d61a3c6fd..79b26a042 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -2,17 +2,18 @@ // Licensed under the Apache License, Version 2.0. using System; -using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; using System.Text; namespace SixLabors.ImageSharp.Formats.Jpeg.Components { /// - /// Represents a Jpeg block with coefficients. + /// 8x8 coefficients matrix of type. 
/// // ReSharper disable once InconsistentNaming + [StructLayout(LayoutKind.Explicit)] internal unsafe struct Block8x8 : IEquatable { /// @@ -20,13 +21,44 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// public const int Size = 64; +#pragma warning disable IDE0051 // Remove unused private member /// - /// A fixed size buffer holding the values. - /// See: - /// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/unsafe-code-pointers/fixed-size-buffers - /// + /// A placeholder buffer so the actual struct occupies exactly 64 * 2 bytes. /// + /// + /// This is not used directly in the code. + /// + [FieldOffset(0)] private fixed short data[Size]; +#pragma warning restore IDE0051 + +#if SUPPORTS_RUNTIME_INTRINSICS + [FieldOffset(0)] + public Vector128 V0; + [FieldOffset(16)] + public Vector128 V1; + [FieldOffset(32)] + public Vector128 V2; + [FieldOffset(48)] + public Vector128 V3; + [FieldOffset(64)] + public Vector128 V4; + [FieldOffset(80)] + public Vector128 V5; + [FieldOffset(96)] + public Vector128 V6; + [FieldOffset(112)] + public Vector128 V7; + + [FieldOffset(0)] + public Vector256 V01; + [FieldOffset(32)] + public Vector256 V23; + [FieldOffset(64)] + public Vector256 V45; + [FieldOffset(96)] + public Vector256 V67; +#endif /// /// Gets or sets a value at the given index @@ -38,7 +70,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components [MethodImpl(MethodImplOptions.AggressiveInlining)] get { - GuardBlockIndex(idx); + DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx)); + ref short selfRef = ref Unsafe.As(ref this); return Unsafe.Add(ref selfRef, idx); } @@ -46,7 +79,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components [MethodImpl(MethodImplOptions.AggressiveInlining)] set { - GuardBlockIndex(idx); + DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx)); + ref short selfRef = ref Unsafe.As(ref this); Unsafe.Add(ref selfRef, idx) = value; } @@ -204,13 +238,6 @@ namespace 
SixLabors.ImageSharp.Formats.Jpeg.Components } } - [Conditional("DEBUG")] - private static void GuardBlockIndex(int idx) - { - DebugGuard.MustBeLessThan(idx, Size, nameof(idx)); - DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx)); - } - /// public override string ToString() { diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index d55dfced7..a11b807bb 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -16,7 +16,7 @@ using System.Text; namespace SixLabors.ImageSharp.Formats.Jpeg.Components { /// - /// Represents a Jpeg block with coefficients. + /// 8x8 coefficients matrix of type. /// [StructLayout(LayoutKind.Explicit)] internal partial struct Block8x8F : IEquatable @@ -102,7 +102,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components [MethodImpl(MethodImplOptions.AggressiveInlining)] get { - GuardBlockIndex(idx); + DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx)); ref float selfRef = ref Unsafe.As(ref this); return Unsafe.Add(ref selfRef, idx); } @@ -110,7 +110,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components [MethodImpl(MethodImplOptions.AggressiveInlining)] set { - GuardBlockIndex(idx); + DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx)); ref float selfRef = ref Unsafe.As(ref this); Unsafe.Add(ref selfRef, idx) = value; } @@ -672,13 +672,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components return row.FastRound(); } - [Conditional("DEBUG")] - private static void GuardBlockIndex(int idx) - { - DebugGuard.MustBeLessThan(idx, Size, nameof(idx)); - DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx)); - } - /// /// Transpose the block into the destination block. 
/// From 2bccda8c03ec44261a563b33f1716ee8dda4ec9c Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 24 Aug 2021 15:29:08 +0300 Subject: [PATCH 12/56] 8x8 matrices small fixes --- .../Formats/Jpeg/Components/Block8x8.cs | 60 ++++++++++++++++++ .../Formats/Jpeg/Components/Block8x8F.cs | 55 +++++++++++++++++ .../Components/Encoder/HuffmanScanEncoder.cs | 61 +------------------ .../Formats/Jpg/HuffmanScanEncoderTests.cs | 10 +-- 4 files changed, 121 insertions(+), 65 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index 79b26a042..adfabc13c 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -2,9 +2,11 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; using System.Text; namespace SixLabors.ImageSharp.Formats.Jpeg.Components @@ -276,6 +278,64 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// public override int GetHashCode() => (this[0] * 31) + this[1]; + /// + /// Returns index of the last non-zero element in given matrix. + /// + /// + /// Returns 0 for all-zero matrix by convention. + /// + /// Index of the last non-zero element. 
+ [MethodImpl(InliningOptions.ShortMethod)] + public int GetLastValuableElementIndex() + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); + + Vector256 zero8 = Vector256.Zero; + + ref Vector256 mcuStride = ref Unsafe.As>(ref this); + + for (int i = 7; i >= 0; i--) + { + int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i).AsInt32(), zero8).AsByte()); + + if (areEqual != equalityMask) + { + // Each 2 bits represents comparison operation for each 2-byte element in input vectors + // LSB represents first element in the stride + // MSB represents last element in the stride + // lzcnt operation would calculate number of zero numbers at the end + + // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements + // So we need to invert it + int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual); + + // As input number is represented by 2 bits in the mask, we need to divide lzcnt result by 2 + // to get the exact number of zero elements in the stride + int strideRelativeIndex = 7 - (lzcnt / 2); + return (i * 8) + strideRelativeIndex; + } + } + + return 0; + } + else +#endif + { + int index = Size - 1; + ref short elemRef = ref Unsafe.As(ref this); + + while (index > 0 && Unsafe.Add(ref elemRef, index) == 0) + { + index--; + } + + return index; + } + } + /// /// Calculate the total sum of absolute differences of elements in 'a' and 'b'. /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index a11b807bb..b0d7b0876 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -864,5 +864,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components return true; } } + + /// + /// Returns index of the last non-zero element in this matrix. 
+ /// + /// Index of the last non-zero element. Returns -1 if all elements are equal to zero. + [MethodImpl(InliningOptions.ShortMethod)] + public int GetLastValuableElementIndex() + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); + + Vector256 zero8 = Vector256.Zero; + + ref Vector256 mcuStride = ref Unsafe.As>(ref this); + + for (int i = 7; i >= 0; i--) + { + int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte()); + + if (areEqual != equalityMask) + { + // Each 4 bits represents comparison operation for each 4-byte element in input vectors + // LSB represents first element in the stride + // MSB represents last element in the stride + // lzcnt operation would calculate number of zero numbers at the end + + // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements + // So we need to invert it + int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual); + + // As input number is represented by 4 bits in the mask, we need to divide lzcnt result by 4 + // to get the exact number of zero elements in the stride + int strideRelativeIndex = 7 - (lzcnt / 4); + return (i * 8) + strideRelativeIndex; + } + } + + return -1; + } + else +#endif + { + int index = Size - 1; + ref float elemRef = ref Unsafe.As(ref this); + + while (index >= 0 && (int)Unsafe.Add(ref elemRef, index) == 0) + { + index--; + } + + return index; + } + } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 373475f6b..134b4e1cc 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -276,7 +276,7 @@ namespace 
SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values; int runLength = 0; - int lastValuableIndex = GetLastValuableElementIndex(ref refTemp2); + int lastValuableIndex = refTemp2.GetLastValuableElementIndex(); for (int zig = 1; zig <= lastValuableIndex; zig++) { int ac = (int)refTemp2[zig]; @@ -458,65 +458,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder #endif } - /// - /// Returns index of the last non-zero element in given matrix. - /// - /// - /// Returns 0 for all-zero matrix by convention. - /// - /// Mcu block. - /// Index of the last non-zero element. - [MethodImpl(InliningOptions.ShortMethod)] - internal static int GetLastValuableElementIndex(ref Block8x8F mcu) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) - { - const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); - - Vector256 zero8 = Vector256.Zero; - - ref Vector256 mcuStride = ref mcu.V0; - - for (int i = 7; i >= 0; i--) - { - int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte()); - - if (areEqual != equalityMask) - { - // Each 4 bits represents comparison operation for each 4-byte element in input vectors - // LSB represents first element in the stride - // MSB represents last element in the stride - // lzcnt operation would calculate number of zero numbers at the end - - // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements - // So we need to invert it - int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual); - - // As input number is represented by 4 bits in the mask, we need to divide lzcnt result by 4 - // to get the exact number of zero elements in the stride - int strideRelativeIndex = 7 - (lzcnt / 4); - return (i * 8) + strideRelativeIndex; - } - } - - return 0; - } - else -#endif - { - int index = Block8x8F.Size - 1; 
- ref float elemRef = ref Unsafe.As(ref mcu); - - while (index > 0 && (int)Unsafe.Add(ref elemRef, index) == 0) - { - index--; - } - - return index; - } - } - [MethodImpl(InliningOptions.ShortMethod)] private void WriteToStream() { diff --git a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs index b953e80b8..f75b0a0b8 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs @@ -95,7 +95,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expectedLessThan = 1; - int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data); + int actual = data.GetLastValuableElementIndex(); Assert.True(actual < expectedLessThan); } @@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expected = Block8x8F.Size - 1; - int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data); + int actual = data.GetLastValuableElementIndex(); Assert.Equal(expected, actual); } @@ -147,7 +147,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expected = setIndex; - int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data); + int actual = data.GetLastValuableElementIndex(); Assert.Equal(expected, actual); } @@ -182,7 +182,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expected = lastIndex; - int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data); + int actual = data.GetLastValuableElementIndex(); Assert.Equal(expected, actual); } @@ -226,7 +226,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expected = lastIndex2; - int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data); + int actual = data.GetLastValuableElementIndex(); Assert.Equal(expected, actual); } From 8098e8ef684ab43ef3bf9ff8ffb1dd0ef71ec25f Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 24 Aug 2021 21:32:13 +0300 Subject: [PATCH 13/56] Fixed last stream flush 
--- .../Components/Encoder/HuffmanScanEncoder.cs | 35 ++++++------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 134b4e1cc..08fe486a9 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -6,10 +6,6 @@ using System.IO; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -#if SUPPORTS_RUNTIME_INTRINSICS -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; -#endif using System.Threading; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -316,25 +312,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8); // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits - uint packedBytes = (this.accumulatedBits | (uint.MaxValue >> this.bitCount)) >> ((4 - valuableBytesCount) * 8); + uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount); - // 2x size due to possible stuff bytes, max out to 8 - Span tempBuffer = stackalloc byte[valuableBytesCount * 2]; + int writeIndex = this.emitWriteIndex; + this.emitBuffer[writeIndex - 1] = packedBytes; - // Write bytes to temporal buffer - int writeCount = 0; - for (int i = 0; i < valuableBytesCount; i++) - { - byte value = (byte)(packedBytes >> (i * 8)); - tempBuffer[writeCount++] = value; - if (value == 0xff) - { - tempBuffer[writeCount++] = 0; - } - } - - // Write temporal buffer to the output stream - this.target.Write(tempBuffer, 0, writeCount); + this.WriteToStream((writeIndex * 4) - valuableBytesCount); } /// @@ -459,14 +442,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } 
[MethodImpl(InliningOptions.ShortMethod)] - private void WriteToStream() + private void WriteToStream() => this.WriteToStream(this.emitWriteIndex * 4); + + [MethodImpl(InliningOptions.ShortMethod)] + private void WriteToStream(int endIndex) { Span emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan()); int writeIdx = 0; - int start = emitBytes.Length - 1; - int end = (this.emitWriteIndex * 4) - 1; - for (int i = start; i > end; i--) + int startIndex = emitBytes.Length - 1; + for (int i = startIndex; i >= endIndex; i--) { byte value = emitBytes[i]; this.streamWriteBuffer[writeIdx++] = value; From e5fec9784451a24fd36efc49d04f5637811019e1 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Wed, 25 Aug 2021 01:50:59 +0300 Subject: [PATCH 14/56] Fixed lvi --- .../Formats/Jpeg/Components/Block8x8.cs | 23 +++++++++---------- .../Formats/Jpeg/Components/Block8x8F.cs | 6 +++-- .../Components/Encoder/HuffmanScanEncoder.cs | 2 +- .../Formats/Jpg/HuffmanScanEncoderTests.cs | 20 ++++++++-------- 4 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index adfabc13c..3e5277c06 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -281,25 +281,24 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Returns index of the last non-zero element in given matrix. /// - /// - /// Returns 0 for all-zero matrix by convention. - /// - /// Index of the last non-zero element. + /// + /// Index of the last non-zero element. Returns -1 if all elements are equal to zero. 
+ /// [MethodImpl(InliningOptions.ShortMethod)] - public int GetLastValuableElementIndex() + public int GetLastNonZeroIndex() { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported) { const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); - Vector256 zero8 = Vector256.Zero; + Vector256 zero16 = Vector256.Zero; ref Vector256 mcuStride = ref Unsafe.As>(ref this); - for (int i = 7; i >= 0; i--) + for (int i = 3; i >= 0; i--) { - int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i).AsInt32(), zero8).AsByte()); + int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i), zero16).AsByte()); if (areEqual != equalityMask) { @@ -314,12 +313,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components // As input number is represented by 2 bits in the mask, we need to divide lzcnt result by 2 // to get the exact number of zero elements in the stride - int strideRelativeIndex = 7 - (lzcnt / 2); - return (i * 8) + strideRelativeIndex; + int strideRelativeIndex = 15 - (lzcnt / 2); + return (i * 16) + strideRelativeIndex; } } - return 0; + return -1; } else #endif @@ -327,7 +326,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components int index = Size - 1; ref short elemRef = ref Unsafe.As(ref this); - while (index > 0 && Unsafe.Add(ref elemRef, index) == 0) + while (index >= 0 && Unsafe.Add(ref elemRef, index) == 0) { index--; } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index b0d7b0876..8479cdc97 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -868,9 +868,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Returns index of the last non-zero element in this matrix. /// - /// Index of the last non-zero element. Returns -1 if all elements are equal to zero. + /// + /// Index of the last non-zero element. 
Returns -1 if all elements are equal to zero. + /// [MethodImpl(InliningOptions.ShortMethod)] - public int GetLastValuableElementIndex() + public int GetLastNonZeroIndex() { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 08fe486a9..fc1146544 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -272,7 +272,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values; int runLength = 0; - int lastValuableIndex = refTemp2.GetLastValuableElementIndex(); + int lastValuableIndex = refTemp2.GetLastNonZeroIndex(); for (int zig = 1; zig <= lastValuableIndex; zig++) { int ac = (int)refTemp2[zig]; diff --git a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs index f75b0a0b8..a3aa957ee 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs @@ -87,7 +87,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg } [Fact] - public void GetLastValuableElementIndex_AllZero() + public void GetLastNonZeroIndex_AllZero() { static void RunTest() { @@ -95,7 +95,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expectedLessThan = 1; - int actual = data.GetLastValuableElementIndex(); + int actual = data.GetLastNonZeroIndex(); Assert.True(actual < expectedLessThan); } @@ -106,7 +106,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg } [Fact] - public void GetLastValuableElementIndex_AllNonZero() + public void GetLastNonZeroIndex_AllNonZero() { static void RunTest() { @@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expected = Block8x8F.Size - 1; - int 
actual = data.GetLastValuableElementIndex(); + int actual = data.GetLastNonZeroIndex(); Assert.Equal(expected, actual); } @@ -131,7 +131,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg [Theory] [InlineData(1)] [InlineData(2)] - public void GetLastValuableElementIndex_RandomFilledSingle(int seed) + public void GetLastNonZeroIndex_RandomFilledSingle(int seed) { static void RunTest(string seedSerialized) { @@ -147,7 +147,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expected = setIndex; - int actual = data.GetLastValuableElementIndex(); + int actual = data.GetLastNonZeroIndex(); Assert.Equal(expected, actual); } @@ -162,7 +162,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg [Theory] [InlineData(1)] [InlineData(2)] - public void GetLastValuableElementIndex_RandomFilledPartially(int seed) + public void GetLastNonZeroIndex_RandomFilledPartially(int seed) { static void RunTest(string seedSerialized) { @@ -182,7 +182,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expected = lastIndex; - int actual = data.GetLastValuableElementIndex(); + int actual = data.GetLastNonZeroIndex(); Assert.Equal(expected, actual); } @@ -197,7 +197,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg [Theory] [InlineData(1)] [InlineData(2)] - public void GetLastValuableElementIndex_RandomFilledFragmented(int seed) + public void GetLastNonZeroIndex_RandomFilledFragmented(int seed) { static void RunTest(string seedSerialized) { @@ -226,7 +226,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expected = lastIndex2; - int actual = data.GetLastValuableElementIndex(); + int actual = data.GetLastNonZeroIndex(); Assert.Equal(expected, actual); } From 81349f2358e6f1b19764928599e2ba8df796aa7f Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Wed, 25 Aug 2021 17:04:45 +0300 Subject: [PATCH 15/56] Docs, fixes, added support for other subsamples/color types --- .../Components/Encoder/HuffmanScanEncoder.cs | 123 +++++++++++++----- .../Formats/Jpeg/JpegEncoderCore.cs 
| 7 +- 2 files changed, 90 insertions(+), 40 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index fc1146544..a6334e2da 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -14,6 +14,51 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { internal class HuffmanScanEncoder { + /// + /// Maximum number of bytes encoded jpeg 8x8 block can occupy. + /// It's highly unlikely for block to occupy this much space - it's a theoretical limit. + /// + /// + /// Where 16 is maximum huffman code binary length according to itu + /// specs. 10 is maximum value binary length, value comes from discrete + /// cosine tranform with value range: [-1024..1023]. Block stores + /// 8x8 = 64 values thus multiplication by 64. Then divided by 8 to get + /// the number of bytes. This value is then multiplied by + /// for performance reasons. + /// + private const int MaxBytesPerBlock = (16 + 10) * 64 / 8 * MaxBytesPerBlockMultiplier; + + /// + /// Multiplier used within cache buffers size calculation. + /// + /// + /// + /// Theoretically, bytes buffer can fit + /// exactly one minimal coding unit. In reality, coding blocks occupy much + /// less space than the theoretical maximum - this can be exploited. + /// If temporal buffer size is multiplied by at least 2, second half of + /// the resulting buffer will be used as an overflow 'guard' if next + /// block would occupy maximum number of bytes. While first half may fit + /// many blocks before needing to flush. + /// + /// + /// This is subject to change. This can be equal to 1 but recomended + /// value is 2 or even greater - futher benchmarking needed. + /// + /// + private const int MaxBytesPerBlockMultiplier = 2; + + /// + /// size multiplier. 
+ /// + /// + /// Jpeg specification requiers to insert 'stuff' bytes after each + /// 0xff byte value. Worst case scenarion is when all bytes are 0xff. + /// While it's highly unlikely (if not impossible) to get such + /// combination, it's theoretically possible so buffer size must be guarded. + /// + private const int OutputBufferLengthMultiplier = 2; + /// /// Compiled huffman tree to encode given values. /// @@ -21,24 +66,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder private HuffmanLut[] huffmanTables; /// - /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offest, count). + /// Buffer for temporal storage of huffman rle encoding bit data. /// /// - /// This is subject to change, 1024 seems to be the best value in terms of performance. - /// expects it to be at least 8 (see comments in method body). + /// Encoding bits are assembled to 4 byte unsigned integers and then copied to this buffer. + /// This process does NOT include inserting stuff bytes. /// - private const int EmitBufferSizeInBytes = 1024; + private readonly uint[] emitBuffer; /// - /// A buffer for reducing the number of stream writes when emitting Huffman tables. + /// Buffer for temporal storage which is then written to the output stream. /// - private readonly uint[] emitBuffer = new uint[EmitBufferSizeInBytes / 4]; - - private readonly byte[] streamWriteBuffer = new byte[EmitBufferSizeInBytes * 2]; - - private const int BytesPerCodingUnit = 256 * 3; + /// + /// Encoding bits from are copied to this byte buffer including stuff bytes. + /// + private readonly byte[] streamWriteBuffer; - private int emitWriteIndex = (EmitBufferSizeInBytes / 4); + private int emitWriteIndex; /// /// Emmited bits 'micro buffer' before being transfered to the . 
@@ -58,11 +102,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// private readonly Stream target; - public HuffmanScanEncoder(Stream outputStream) + public HuffmanScanEncoder(int componentCount, Stream outputStream) { + int emitBufferByteLength = MaxBytesPerBlock * componentCount; + this.emitBuffer = new uint[emitBufferByteLength / sizeof(uint)]; + this.emitWriteIndex = this.emitBuffer.Length; + + this.streamWriteBuffer = new byte[emitBufferByteLength * OutputBufferLengthMultiplier]; + this.target = outputStream; } + private bool IsFlushNeeded + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => this.emitWriteIndex < this.emitBuffer.Length / 2; + } + /// /// Encodes the image with no subsampling. /// @@ -117,14 +173,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref chrominanceQuantTable, ref unzig); - if (this.emitWriteIndex < this.emitBuffer.Length / 2) + if (this.IsFlushNeeded) { - this.WriteToStream(); + this.FlushToStream(); } } } - this.EmitFinalBits(); + this.FlushRemainingBytes(); } /// @@ -190,10 +246,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref pixelConverter.Cr, ref chrominanceQuantTable, ref unzig); + + if (this.IsFlushNeeded) + { + this.FlushToStream(); + } } } - this.FlushInternalBuffer(); + this.FlushRemainingBytes(); } /// @@ -233,10 +294,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref pixelConverter.Y, ref luminanceQuantTable, ref unzig); + + if (this.IsFlushNeeded) + { + this.FlushToStream(); + } } } - this.FlushInternalBuffer(); + this.FlushRemainingBytes(); } /// @@ -306,7 +372,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } [MethodImpl(InliningOptions.ShortMethod)] - private void EmitFinalBits() + private void FlushRemainingBytes() { // Bytes count we want to write to the output stream int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8); @@ -317,7 +383,7 @@ namespace 
SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder int writeIndex = this.emitWriteIndex; this.emitBuffer[writeIndex - 1] = packedBytes; - this.WriteToStream((writeIndex * 4) - valuableBytesCount); + this.FlushToStream((writeIndex * 4) - valuableBytesCount); } /// @@ -391,21 +457,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder this.Emit(prefix | (encodedValue >> prefixLen), prefixLen + valueLen); } - /// - /// Writes remaining bytes from internal buffer to the target stream. - /// - /// Pads last byte with 1's if necessary - private void FlushInternalBuffer() - { - // pad last byte with 1's - //int padBitsCount = 8 - (this.bitCount % 8); - //if (padBitsCount != 0) - //{ - // this.Emit((1 << padBitsCount) - 1, padBitsCount); - // this.target.Write(this.emitBuffer, 0, this.emitLen); - //} - } - /// /// Calculates how many minimum bits needed to store given value for Huffman jpeg encoding. /// @@ -442,10 +493,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } [MethodImpl(InliningOptions.ShortMethod)] - private void WriteToStream() => this.WriteToStream(this.emitWriteIndex * 4); + private void FlushToStream() => this.FlushToStream(this.emitWriteIndex * 4); [MethodImpl(InliningOptions.ShortMethod)] - private void WriteToStream(int endIndex) + private void FlushToStream(int endIndex) { Span emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan()); diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs index 88d96f554..8c6726e65 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs @@ -114,11 +114,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg this.WriteStartOfScan(image, componentCount, cancellationToken); // Write the scan compressed data. 
- var scanEncoder = new HuffmanScanEncoder(stream); if (this.colorType == JpegColorType.Luminance) { // luminance quantization table only - scanEncoder.EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken); + new HuffmanScanEncoder(1, stream).EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken); } else { @@ -126,10 +125,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg switch (this.subsample) { case JpegSubsample.Ratio444: - scanEncoder.Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); + new HuffmanScanEncoder(3, stream).Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); break; case JpegSubsample.Ratio420: - scanEncoder.Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); + new HuffmanScanEncoder(6, stream).Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); break; } } From 6c5cf28ecdb35b1a286b9ece0106975a35030589 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Thu, 26 Aug 2021 13:36:50 +0300 Subject: [PATCH 16/56] New zig-zag implementation --- .../Formats/Jpeg/Components/Block8x8.cs | 2 +- .../Jpeg/Components/Block8x8F.Intrinsic.cs | 87 ++++ .../Jpeg/Components/Block8x8F.ScaledCopyTo.cs | 2 +- .../Formats/Jpeg/Components/Block8x8F.cs | 138 +----- .../Components/Decoder/HuffmanScanDecoder.cs | 17 +- .../Jpeg/Components/Decoder/IRawJpegData.cs | 2 +- .../Decoder/JpegBlockPostProcessor.cs | 2 +- .../Components/Encoder/HuffmanScanEncoder.cs | 43 +- .../Formats/Jpeg/Components/Quantization.cs | 67 +-- .../Jpeg/Components/ZigZag.Intrinsic.cs | 404 ++++++++++++++++++ .../Formats/Jpeg/Components/ZigZag.cs | 79 +--- .../Formats/Jpeg/JpegDecoderCore.cs | 6 +- .../Formats/Jpeg/JpegEncoderCore.cs | 12 +- .../Formats/Jpg/Block8x8FTests.cs | 74 +--- .../Formats/Jpg/QuantizationTests.cs | 8 +- .../Jpg/Utils/ReferenceImplementations.cs | 54 +-- .../Formats/Jpg/ZigZagTests.cs | 5 +- 17 files changed, 
627 insertions(+), 375 deletions(-) create mode 100644 src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs create mode 100644 src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index 3e5277c06..c76eb942f 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -12,7 +12,7 @@ using System.Text; namespace SixLabors.ImageSharp.Formats.Jpeg.Components { /// - /// 8x8 coefficients matrix of type. + /// 8x8 matrix of coefficients. /// // ReSharper disable once InconsistentNaming [StructLayout(LayoutKind.Explicit)] diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs new file mode 100644 index 000000000..073580d40 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -0,0 +1,87 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +#if SUPPORTS_RUNTIME_INTRINSICS +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components +{ + internal partial struct Block8x8F + { + /// + /// A number of rows of 8 scalar coefficients each in + /// + public const int RowCount = 8; + + [FieldOffset(0)] + public Vector256 V0; + [FieldOffset(32)] + public Vector256 V1; + [FieldOffset(64)] + public Vector256 V2; + [FieldOffset(96)] + public Vector256 V3; + [FieldOffset(128)] + public Vector256 V4; + [FieldOffset(160)] + public Vector256 V5; + [FieldOffset(192)] + public Vector256 V6; + [FieldOffset(224)] + public Vector256 V7; + + private static ReadOnlySpan DivideIntoInt16_Avx2_ShuffleMask => new int[] { + 0, 1, 4, 5, 2, 3, 6, 7 + }; + + private static unsafe void DivideIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + { + DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); + + fixed (int* maskPtr = DivideIntoInt16_Avx2_ShuffleMask) + { + Vector256 crossLaneShuffleMask = Avx.LoadVector256(maskPtr).AsInt32(); + + ref Vector256 aBase = ref Unsafe.As>(ref a); + ref Vector256 bBase = ref Unsafe.As>(ref b); + + ref Vector256 destBase = ref Unsafe.As>(ref dest); + + for (int i = 0; i < 8; i += 2) + { + Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + + Vector256 row = Avx2.PackSignedSaturate(row0, row1); + row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16(); + + Unsafe.Add(ref destBase, i / 2) = row; + } + } + } + + private static void DivideIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + { + DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this 
operation!"); + + ref Vector128 aBase = ref Unsafe.As>(ref a); + ref Vector128 bBase = ref Unsafe.As>(ref b); + + ref Vector128 destBase = ref Unsafe.As>(ref dest); + + for (int i = 0; i < 16; i += 2) + { + Vector128 left = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector128 right = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + + Vector128 row = Sse2.PackSignedSaturate(left, right); + Unsafe.Add(ref destBase, i / 2) = row; + } + } + } +} +#endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs index 23cf4ce4a..498fe4d03 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs @@ -1,4 +1,4 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. using System.Numerics; diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 8479cdc97..79a35e2cd 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -16,7 +16,7 @@ using System.Text; namespace SixLabors.ImageSharp.Formats.Jpeg.Components { /// - /// 8x8 coefficients matrix of type. + /// 8x8 matrix of coefficients. 
/// [StructLayout(LayoutKind.Explicit)] internal partial struct Block8x8F : IEquatable @@ -66,30 +66,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components public Vector4 V7L; [FieldOffset(240)] public Vector4 V7R; - -#if SUPPORTS_RUNTIME_INTRINSICS - /// - /// A number of rows of 8 scalar coefficients each in - /// - public const int RowCount = 8; - - [FieldOffset(0)] - public Vector256 V0; - [FieldOffset(32)] - public Vector256 V1; - [FieldOffset(64)] - public Vector256 V2; - [FieldOffset(96)] - public Vector256 V3; - [FieldOffset(128)] - public Vector256 V4; - [FieldOffset(160)] - public Vector256 V5; - [FieldOffset(192)] - public Vector256 V6; - [FieldOffset(224)] - public Vector256 V7; -#endif #pragma warning restore SA1600 // ElementsMustBeDocumented /// @@ -188,13 +164,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components return result; } - /// - /// Fill the block with defaults (zeroes). - /// - [MethodImpl(InliningOptions.ShortMethod)] - public void Clear() - => this = default; // The cheapest way to do this in C#: - /// /// Load raw 32bit floating point data from source. /// @@ -302,7 +271,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components public float[] ToArray() { - var result = new float[Size]; + float[] result = new float[Size]; this.ScaledCopyTo(result); return result; } @@ -434,102 +403,37 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } /// - /// Quantize the block. - /// - /// The block pointer. - /// The qt pointer. 
- /// Unzig pointer - public static unsafe void DequantizeBlock(Block8x8F* blockPtr, Block8x8F* qtPtr, byte* unzigPtr) - { - float* b = (float*)blockPtr; - float* qtp = (float*)qtPtr; - for (int qtIndex = 0; qtIndex < Size; qtIndex++) - { - byte blockIndex = unzigPtr[qtIndex]; - float* unzigPos = b + blockIndex; - - float val = *unzigPos; - val *= qtp[qtIndex]; - *unzigPos = val; - } - } - - /// - /// Quantize 'block' into 'dest' using the 'qt' quantization table: - /// Unzig the elements of block into dest, while dividing them by elements of qt and "pre-rounding" the values. - /// To finish the rounding it's enough to (int)-cast these values. + /// Quantize input block, apply zig-zag ordering and store result as 16bit integers. /// - /// Source block - /// Destination block - /// The quantization table - /// The 8x8 Unzig block. - public static unsafe void Quantize( - ref Block8x8F block, - ref Block8x8F dest, - ref Block8x8F qt, - ref ZigZag unZig) + /// Source block. + /// Destination block. + /// The quantization table. 
+ public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt) { - for (int zig = 0; zig < Size; zig++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) { - dest[zig] = block[unZig[zig]]; + DivideIntoInt16_Avx2(ref block, ref qt, ref dest); + ZigZag.ApplyZigZagOrderingAvx(ref dest, ref dest); } - - DivideRoundAll(ref dest, ref qt); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) + else if (Ssse3.IsSupported) { - var vnegOne = Vector256.Create(-1f); - var vadd = Vector256.Create(.5F); - var vone = Vector256.Create(1f); - - for (int i = 0; i < RowCount; i++) - { - ref Vector256 aRow = ref Unsafe.Add(ref a.V0, i); - ref Vector256 bRow = ref Unsafe.Add(ref b.V0, i); - Vector256 voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aRow), vone), vadd); - aRow = Avx.Add(Avx.Divide(aRow, bRow), voff); - } + DivideIntoInt16_Sse2(ref block, ref qt, ref dest); + ZigZag.ApplyZigZagOrderingSse(ref dest, ref dest); } else #endif { - a.V0L = DivideRound(a.V0L, b.V0L); - a.V0R = DivideRound(a.V0R, b.V0R); - a.V1L = DivideRound(a.V1L, b.V1L); - a.V1R = DivideRound(a.V1R, b.V1R); - a.V2L = DivideRound(a.V2L, b.V2L); - a.V2R = DivideRound(a.V2R, b.V2R); - a.V3L = DivideRound(a.V3L, b.V3L); - a.V3R = DivideRound(a.V3R, b.V3R); - a.V4L = DivideRound(a.V4L, b.V4L); - a.V4R = DivideRound(a.V4R, b.V4R); - a.V5L = DivideRound(a.V5L, b.V5L); - a.V5R = DivideRound(a.V5R, b.V5R); - a.V6L = DivideRound(a.V6L, b.V6L); - a.V6R = DivideRound(a.V6R, b.V6R); - a.V7L = DivideRound(a.V7L, b.V7L); - a.V7R = DivideRound(a.V7R, b.V7R); + for (int i = 0; i < Size; i++) + { + // TODO: find a way to index block & qt matrices with natural order indices for performance? + int zig = ZigZag.ZigZagOrder[i]; + float divRes = block[zig] / qt[zig]; + dest[i] = (short)(divRes + (divRes > 0 ? 
0.5f : -0.5f)); + } } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor) - { - var neg = new Vector4(-1); - var add = new Vector4(.5F); - - // sign(dividend) = max(min(dividend, 1), -1) - Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One); - - // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend) - return (dividend / divisor) + (sign * add); - } - public void RoundInto(ref Block8x8 dest) { for (int i = 0; i < Size; i++) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs index 70a446512..bbc4e40af 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs @@ -54,9 +54,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder /// private readonly HuffmanTable[] acHuffmanTables; - // The unzig data. 
- private ZigZag dctZigZag; - private HuffmanScanBuffer scanBuffer; private readonly SpectralConverter spectralConverter; @@ -74,7 +71,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder SpectralConverter converter, CancellationToken cancellationToken) { - this.dctZigZag = ZigZag.CreateUnzigTable(); this.stream = stream; this.spectralConverter = converter; this.cancellationToken = cancellationToken; @@ -477,7 +473,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder { ref short blockDataRef = ref Unsafe.As(ref block); ref HuffmanScanBuffer buffer = ref this.scanBuffer; - ref ZigZag zigzag = ref this.dctZigZag; // DC int t = buffer.DecodeHuffman(ref dcTable); @@ -502,7 +497,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder { i += r; s = buffer.Receive(s); - Unsafe.Add(ref blockDataRef, zigzag[i++]) = (short)s; + Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s; } else { @@ -556,7 +551,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder } ref HuffmanScanBuffer buffer = ref this.scanBuffer; - ref ZigZag zigzag = ref this.dctZigZag; int start = this.SpectralStart; int end = this.SpectralEnd; int low = this.SuccessiveLow; @@ -572,7 +566,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder if (s != 0) { s = buffer.Receive(s); - Unsafe.Add(ref blockDataRef, zigzag[i]) = (short)(s << low); + Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low); } else { @@ -602,7 +596,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder { // Refinement scan for these AC coefficients ref HuffmanScanBuffer buffer = ref this.scanBuffer; - ref ZigZag zigzag = ref this.dctZigZag; int start = this.SpectralStart; int end = this.SpectralEnd; @@ -649,7 +642,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder do { - ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]); + ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]); if 
(coef != 0) { buffer.CheckBits(); @@ -675,7 +668,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder if ((s != 0) && (k < 64)) { - Unsafe.Add(ref blockDataRef, zigzag[k]) = (short)s; + Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s; } } } @@ -684,7 +677,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder { for (; k <= end; k++) { - ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]); + ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]); if (coef != 0) { diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs index 391dac784..0b80acc5d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs @@ -22,7 +22,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder IJpegComponent[] Components { get; } /// - /// Gets the quantization tables, in zigzag order. + /// Gets the quantization tables, in natural order. 
/// Block8x8F[] QuantizationTables { get; } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs index 7cfbaddcc..00169d082 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs @@ -46,7 +46,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder public JpegBlockPostProcessor(IRawJpegData decoder, IJpegComponent component) { int qtIndex = component.QuantizationTableIndex; - this.DequantiazationTable = ZigZag.CreateDequantizationTable(ref decoder.QuantizationTables[qtIndex]); + this.DequantiazationTable = decoder.QuantizationTables[qtIndex]; this.subSamplingDivisors = component.SubSamplingDivisors; this.SourceBlock = default; diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index a6334e2da..8b61b66c9 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -96,6 +96,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder private Block8x8F temporalBlock1; private Block8x8F temporalBlock2; + private Block8x8 temporalShortBlock; /// /// The output stream. All attempted writes after the first error become no-ops. 
@@ -132,8 +133,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { this.huffmanTables = HuffmanLut.TheHuffmanLut; - var unzig = ZigZag.CreateUnzigTable(); - // ReSharper disable once InconsistentNaming int prevDCY = 0, prevDCCb = 0, prevDCCr = 0; @@ -156,22 +155,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder QuantIndex.Luminance, prevDCY, ref pixelConverter.Y, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); prevDCCb = this.WriteBlock( QuantIndex.Chrominance, prevDCCb, ref pixelConverter.Cb, - ref chrominanceQuantTable, - ref unzig); + ref chrominanceQuantTable); prevDCCr = this.WriteBlock( QuantIndex.Chrominance, prevDCCr, ref pixelConverter.Cr, - ref chrominanceQuantTable, - ref unzig); + ref chrominanceQuantTable); if (this.IsFlushNeeded) { @@ -197,8 +193,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { this.huffmanTables = HuffmanLut.TheHuffmanLut; - var unzig = ZigZag.CreateUnzigTable(); - // ReSharper disable once InconsistentNaming int prevDCY = 0, prevDCCb = 0, prevDCCr = 0; ImageFrame frame = pixels.Frames.RootFrame; @@ -222,30 +216,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder QuantIndex.Luminance, prevDCY, ref pixelConverter.YLeft, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); prevDCY = this.WriteBlock( QuantIndex.Luminance, prevDCY, ref pixelConverter.YRight, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); } prevDCCb = this.WriteBlock( QuantIndex.Chrominance, prevDCCb, ref pixelConverter.Cb, - ref chrominanceQuantTable, - ref unzig); + ref chrominanceQuantTable); prevDCCr = this.WriteBlock( QuantIndex.Chrominance, prevDCCr, ref pixelConverter.Cr, - ref chrominanceQuantTable, - ref unzig); + ref chrominanceQuantTable); if (this.IsFlushNeeded) { @@ -269,8 +259,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { this.huffmanTables = HuffmanLut.TheHuffmanLut; - var unzig = 
ZigZag.CreateUnzigTable(); - // ReSharper disable once InconsistentNaming int prevDCY = 0; @@ -292,8 +280,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder QuantIndex.Luminance, prevDCY, ref pixelConverter.Y, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); if (this.IsFlushNeeded) { @@ -320,28 +307,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder QuantIndex index, int prevDC, ref Block8x8F src, - ref Block8x8F quant, - ref ZigZag unZig) + ref Block8x8F quant) { ref Block8x8F refTemp1 = ref this.temporalBlock1; ref Block8x8F refTemp2 = ref this.temporalBlock2; + ref Block8x8 spectralBlock = ref this.temporalShortBlock; FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2); - Block8x8F.Quantize(ref refTemp1, ref refTemp2, ref quant, ref unZig); + Block8x8F.Quantize(ref refTemp1, ref spectralBlock, ref quant); // Emit the DC delta. - int dc = (int)refTemp2[0]; + int dc = spectralBlock[0]; this.EmitHuffRLE(this.huffmanTables[2 * (int)index].Values, 0, dc - prevDC); // Emit the AC components. int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values; int runLength = 0; - int lastValuableIndex = refTemp2.GetLastNonZeroIndex(); + int lastValuableIndex = spectralBlock.GetLastNonZeroIndex(); for (int zig = 1; zig <= lastValuableIndex; zig++) { - int ac = (int)refTemp2[zig]; + int ac = spectralBlock[zig]; if (ac == 0) { diff --git a/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs b/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs index 2ff56c63b..eab5e6a08 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs @@ -39,53 +39,59 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components public const int QualityEstimationConfidenceUpperThreshold = 98; /// - /// Gets the unscaled luminance quantization table in zig-zag order. Each - /// encoder copies and scales the tables according to its quality parameter. 
- /// The values are derived from ITU section K.1 after converting from natural to - /// zig-zag order. + /// Gets unscaled luminance quantization table. /// + /// + /// The values are derived from ITU section K.1. + /// // The C# compiler emits this as a compile-time constant embedded in the PE file. // This is effectively compiled down to: return new ReadOnlySpan(&data, length) // More details can be found: https://github.com/dotnet/roslyn/pull/24621 - public static ReadOnlySpan UnscaledQuant_Luminance => new byte[] + public static ReadOnlySpan LuminanceTable => new byte[] { - 16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24, - 40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60, - 57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80, - 109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112, - 100, 120, 92, 101, 103, 99, + 16, 11, 10, 16, 24, 40, 51, 61, + 12, 12, 14, 19, 26, 58, 60, 55, + 14, 13, 16, 24, 40, 57, 69, 56, + 14, 17, 22, 29, 51, 87, 80, 62, + 18, 22, 37, 56, 68, 109, 103, 77, + 24, 35, 55, 64, 81, 104, 113, 92, + 49, 64, 78, 87, 103, 121, 120, 101, + 72, 92, 95, 98, 112, 100, 103, 99, }; /// - /// Gets the unscaled chrominance quantization table in zig-zag order. Each - /// encoder copies and scales the tables according to its quality parameter. - /// The values are derived from ITU section K.1 after converting from natural to - /// zig-zag order. + /// Gets unscaled chrominance quantization table. /// + /// + /// The values are derived from ITU section K.1. + /// // The C# compiler emits this as a compile-time constant embedded in the PE file. 
// This is effectively compiled down to: return new ReadOnlySpan(&data, length) // More details can be found: https://github.com/dotnet/roslyn/pull/24621 - public static ReadOnlySpan UnscaledQuant_Chrominance => new byte[] + public static ReadOnlySpan ChrominanceTable => new byte[] { - 17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, + 17, 18, 24, 47, 99, 99, 99, 99, + 18, 21, 26, 66, 99, 99, 99, 99, + 24, 26, 56, 99, 99, 99, 99, 99, + 47, 66, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, }; /// Ported from JPEGsnoop: /// https://github.com/ImpulseAdventure/JPEGsnoop/blob/9732ee0961f100eb69bbff4a0c47438d5997abee/source/JfifDecode.cpp#L4570-L4694 /// - /// Estimates jpeg quality based on quantization table in zig-zag order. + /// Estimates jpeg quality based on standard quantization table. /// /// - /// This technically can be used with any given table but internal decoder code uses ITU spec tables: - /// and . + /// Technically, this can be used with any given table but internal decoder code uses ITU spec tables: + /// and . /// /// Input quantization table. - /// Quantization to estimate against. - /// Estimated quality + /// Natural order quantization table to estimate against. + /// Estimated quality. public static int EstimateQuality(ref Block8x8F table, ReadOnlySpan target) { // This method can be SIMD'ified if standard table is injected as Block8x8F. @@ -106,11 +112,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components int quality; for (int i = 0; i < Block8x8F.Size; i++) { - float coeff = table[i]; - int coeffInteger = (int)coeff; + int coeff = (int)table[i]; // Coefficients are actually int16 casted to float numbers so there's no truncating error. 
- if (coeffInteger != 0) + if (coeff != 0) { comparePercent = 100.0 * (table[i] / target[i]); } @@ -152,7 +157,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Estimated quality [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int EstimateLuminanceQuality(ref Block8x8F luminanceTable) - => EstimateQuality(ref luminanceTable, UnscaledQuant_Luminance); + => EstimateQuality(ref luminanceTable, LuminanceTable); /// /// Estimates jpeg quality based on quantization table in zig-zag order. @@ -161,7 +166,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Estimated quality [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int EstimateChrominanceQuality(ref Block8x8F chrominanceTable) - => EstimateQuality(ref chrominanceTable, UnscaledQuant_Chrominance); + => EstimateQuality(ref chrominanceTable, ChrominanceTable); [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int QualityToScale(int quality) @@ -185,10 +190,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Block8x8F ScaleLuminanceTable(int quality) - => ScaleQuantizationTable(scale: QualityToScale(quality), UnscaledQuant_Luminance); + => ScaleQuantizationTable(scale: QualityToScale(quality), LuminanceTable); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Block8x8F ScaleChrominanceTable(int quality) - => ScaleQuantizationTable(scale: QualityToScale(quality), UnscaledQuant_Chrominance); + => ScaleQuantizationTable(scale: QualityToScale(quality), ChrominanceTable); } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs new file mode 100644 index 000000000..066eb2846 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -0,0 +1,404 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +#if SUPPORTS_RUNTIME_INTRINSICS +using System; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components +{ + internal static partial class ZigZag + { + /// + /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics. + /// + private const byte Z = 0xff; + + /// + /// Gets shuffle vectors for + /// zig zag implementation. + /// + private static ReadOnlySpan SseShuffleMasks => new byte[] + { + // 0_A + 0, 1, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, 6, 7, Z, Z, + // 0_B + Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, + // 0_C + Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, + + // 1_A + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, 10, 11, + // 1_B + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, Z, Z, + // 1_C + 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, + // 1_D + Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, Z, Z, + // 1_E + Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + + // 2_B + 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + // 2_C + Z, Z, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + // 2_D + Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + // 2_E + Z, Z, Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, + // 2_F + Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, + // 2_G + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, + + // 3_A + Z, Z, Z, Z, Z, Z, 12, 13, 14, 15, Z, Z, Z, Z, Z, Z, + // 3_B + Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, + // 3_C + Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, + // 3_D/4_E + 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, + + // 4_F + Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, + // 4_G + Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, + // 4_H + Z, Z, Z, Z, Z, Z, 0, 1, 2, 3, Z, Z, Z, Z, Z, Z, + + // 5_B + Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + // 5_C + Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, + // 5_D + 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, Z, Z, + // 5_E + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, 
Z, + // 5_F + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, Z, Z, + // 5_G + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, + + // 6_D + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, + // 6_E + Z, Z, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, + // 6_F + Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, + // 6_G + Z, Z, Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + // 6_H + 4, 5, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + + // 7_F + Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, + // 7_G + 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, + // 7_H + Z, Z, 8, 9, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, 14, 15 + }; + + /// + /// Gets shuffle vectors for + /// zig zag implementation. + /// + private static ReadOnlySpan AvxShuffleMasks => new byte[] + { + // 01_AB/01_EF/23_CD - cross-lane + 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, + + // 01_AB - inner-lane + 0, 1, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, 6, 7, + + // 01_CD/23_GH - cross-lane + 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, Z, Z, Z, Z, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, Z, Z, Z, Z, + + // 01_CD - inner-lane + Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, + + // 01_EF - inner-lane + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + + // 23_AB/45_CD/67_EF - cross-lane + 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, Z, Z, Z, Z, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, Z, Z, Z, Z, + + // 23_AB - inner-lane + 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, 2, 3, 8, 9, Z, Z, Z, Z, + + // 23_CD - inner-lane + Z, Z, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 12, 13, + + // 23_EF - inner-lane + Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + + // 23_GH - inner-lane + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, 
Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + + // 45_AB - inner-lane + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + + // 45_CD - inner-lane + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, Z, Z, 2, 3, 8, 9, Z, Z, Z, Z, Z, Z, + + // 45_EF - cross-lane + 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, Z, Z, Z, Z, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, + + // 45_EF - inner-lane + 2, 3, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, + + // 45_GH - inner-lane + Z, Z, Z, Z, 2, 3, 8, 9, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, + + // 67_CD - inner-lane + Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + + // 67_EF - inner-lane + Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, Z, Z, 2, 3, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, + + // 67_GH - inner-lane + 8, 9, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, 10, 11, 4, 5, Z, Z, 6, 7, 12, 13, 14, 15 + }; + + /// + /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics. + /// + /// + /// Requires Ssse3 support. + /// + /// Input matrix. + /// Matrix to store the result. Can be a reference to input matrix. 
+ public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 source, ref Block8x8 dest) + { + DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!"); + + fixed (byte* maskPtr = SseShuffleMasks) + { + Vector128 A = source.V0.AsByte(); + Vector128 B = source.V1.AsByte(); + Vector128 C = source.V2.AsByte(); + Vector128 D = source.V3.AsByte(); + Vector128 E = source.V4.AsByte(); + Vector128 F = source.V5.AsByte(); + Vector128 G = source.V6.AsByte(); + Vector128 H = source.V7.AsByte(); + + // row0 + Vector128 row0_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16(); + Vector128 row0_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16(); + Vector128 row0 = Sse2.Or(row0_A, row0_B); + Vector128 row0_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16(); + row0 = Sse2.Or(row0, row0_C); + + // row1 + Vector128 row1_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16(); + Vector128 row1_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16(); + Vector128 row1 = Sse2.Or(row1_A, row1_B); + Vector128 row1_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1_C); + Vector128 row1_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1_D); + Vector128 row1_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1_E); + + // row2 + Vector128 row2_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16(); + Vector128 row2_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16(); + Vector128 row2 = Sse2.Or(row2_B, row2_C); + Vector128 row2_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2_D); + Vector128 row2_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2_E); + Vector128 row2_F = Ssse3.Shuffle(F, 
Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2_F); + Vector128 row2_G = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2_G); + + // row3 + Vector128 A_3 = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16(); + Vector128 B_3 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16(); + Vector128 row3 = Sse2.Or(A_3, B_3); + Vector128 C_3 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16(); + row3 = Sse2.Or(row3, C_3); + Vector128 D3_E4_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16)); + Vector128 D_3 = Ssse3.Shuffle(D, D3_E4_shuffleMask).AsInt16(); + row3 = Sse2.Or(row3, D_3); + + // row4 + Vector128 E_4 = Ssse3.Shuffle(E, D3_E4_shuffleMask).AsInt16(); + Vector128 F_4 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16(); + Vector128 row4 = Sse2.Or(E_4, F_4); + Vector128 G_4 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16(); + row4 = Sse2.Or(row4, G_4); + Vector128 H_4 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16(); + row4 = Sse2.Or(row4, H_4); + + // row5 + Vector128 B_5 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16(); + Vector128 C_5 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16(); + Vector128 row5 = Sse2.Or(B_5, C_5); + Vector128 D_5 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16(); + row5 = Sse2.Or(row5, D_5); + Vector128 E_5 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16(); + row5 = Sse2.Or(row5, E_5); + Vector128 F_5 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16(); + row5 = Sse2.Or(row5, F_5); + Vector128 G_5 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16(); + row5 = Sse2.Or(row5, G_5); + + // row6 + Vector128 D_6 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16(); + Vector128 E_6 = Ssse3.Shuffle(E, 
Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16(); + Vector128 row6 = Sse2.Or(D_6, E_6); + Vector128 F_6 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16(); + row6 = Sse2.Or(row6, F_6); + Vector128 G_6 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16(); + row6 = Sse2.Or(row6, G_6); + Vector128 H_6 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16(); + row6 = Sse2.Or(row6, H_6); + + // row7 + Vector128 F_7 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16(); + Vector128 G_7 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16(); + Vector128 row7 = Sse2.Or(F_7, G_7); + Vector128 H_7 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16(); + row7 = Sse2.Or(row7, H_7); + + dest.V0 = row0; + dest.V1 = row1; + dest.V2 = row2; + dest.V3 = row3; + dest.V4 = row4; + dest.V5 = row5; + dest.V6 = row6; + dest.V7 = row7; + } + } + + /// + /// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics. + /// + /// + /// Requires Avx2 support. + /// + /// Input matrix. + /// Matrix to store the result. Can be a reference to input matrix. 
+ public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 source, ref Block8x8 dest) + { + DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); + + fixed (byte* shuffleVectorsPtr = AvxShuffleMasks) + { + // 18 loads + // 10 cross-lane shuffles (permutations) + // 14 shuffles + // 10 bitwise or's + // 4 stores + + // A0 A1 A2 A3 A4 A5 A6 A7 | B0 B1 B2 B3 B4 B5 B6 B7 + // C0 C1 C2 C3 C4 C5 C6 C7 | D0 D1 D2 D3 D4 D5 D6 D7 + // E0 E1 E2 E3 E4 E5 E6 E7 | F0 F1 F2 F3 F4 F5 F6 F7 + // G0 G1 G2 G3 G4 G5 G6 G7 | H0 H1 H2 H3 H4 H5 H6 H7 + Vector256 AB = source.V01.AsByte(); + Vector256 CD = source.V23.AsByte(); + Vector256 EF = source.V45.AsByte(); + Vector256 GH = source.V67.AsByte(); + + // row01 - A0 A1 B0 C0 B1 A2 A3 B2 | C1 D0 E0 D1 C2 B3 A4 A5 + Vector256 AB01_EF01_CD23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32(); + + // row01_AB - (A0 A1) (B0 B1) (A2 A3) (B2 B3) | (B2 B3) (A4 A5) (X X) (X X) + Vector256 row01_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); + // row01_AB - (A0 A1) (B0 X) (B1 A2) (A3 B2) | (X X) (X X) (X B3) (A4 A5) + row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte(); + + Vector256 CD01_GH23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32(); + + // row01_CD - (C0 C1) (X X) (X X) (X X) | (C0 C1) (D0 D1) (C2 C3) (X X) + Vector256 row01_CD = Avx2.PermuteVar8x32(CD.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte(); + // row01_CD - (X X) (X C0) (X X) (X X) | (C1 D0) (X D1) (C2 X) (X X) + row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte(); + + // row01_EF - (E0 E1) (E2 E3) (F0 F1) (X X) | (E0 E1) (X X) (X X) (X X) + Vector256 row0123_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); + // row01_EF - (X X) (X X) (X X) (X X) | (X X) (E0 X) (X X) (X X) + Vector256 row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr 
+ (4 * 32))).AsByte(); + + Vector256 row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF); + + + // row23 - B4 C3 D2 E1 F0 G0 F1 E2 | D3 C4 B5 A6 A7 B6 C5 D4 + + Vector256 AB23_CD45_EF67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32(); + + // row23_AB - (B4 B5) (X X) (X X) (X X) | (B4 B5) (B6 B7) (A6 A7) (X X) + Vector256 row2345_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); + // row23_AB - (B4 X) (X X) (X X) (X X) | (X X) (B5 A6) (A7 B6) (X X) + Vector256 row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte(); + + // row23_CD - (C2 C3) (D2 D3) (X X) (X X) | (D2 D3) (C4 C5) (D4 D5) (X X) + Vector256 row23_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); + // row23_CD - (X C3) (D2 X) (X X) (X X) | (D3 C4) (X X) (X X) (C5 D4) + row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte(); + + // row23_EF - (X X) (X E1) (F0 X) (F1 E2) | (X X) (X X) (X X) (X X) + Vector256 row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte(); + + // row23_GH - (G0 G1) (G2 G3) (H0 H1) (X X) | (G2 G3) (X X) (X X) (X X) + Vector256 row2345_GH = Avx2.PermuteVar8x32(GH.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte(); + // row23_GH - (X X) (X X) (X G0) (X X) | (X X) (X X) (X X) (X X) + Vector256 row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte()); + + Vector256 row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH)); + + + // row45 - E3 F2 G1 H0 H1 G2 F3 E4 | D5 C6 B7 C7 D6 E5 F4 G3 + + // row45_AB - (X X) (X X) (X X) (X X) | (X X) (B7 X) (X X) (X X) + Vector256 row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte()); + + // row45_CD - (D6 D7) (X X) (X X) (X X) | (C6 C7) (D4 D5) (D6 D7) (X X) + Vector256 row4567_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); + // 
row45_CD - (X X) (X X) (X X) (X X) | (D5 C6) (X C7) (D6 X) (X X) + Vector256 row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte()); + + Vector256 EF45_GH67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32(); + + // row45_EF - (E2 E3) (E4 E5) (F2 F3) (X X) | (E4 E5) (F4 F5) (X X) (X X) + Vector256 row45_EF = Avx2.PermuteVar8x32(EF.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte(); + // row45_EF - (E3 F2) (X X) (X X) (F3 E4) | (X X) (X X) (X E5) (F4 X) + row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte()); + + // row45_GH - (X X) (G1 H0) (H1 G2) (X X) | (X X) (X X) (X X) (X G3) + Vector256 row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte()); + + Vector256 row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH)); + + + // row67 - H2 H3 G4 F5 E6 D7 E7 F6 | G5 H4 H5 G6 F7 G7 H6 H7 + + // row67_CD - (X X) (X X) (X D7) (X X) | (X X) (X X) (X X) (X X) + Vector256 row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte()); + + // row67_EF - (E6 E7) (F4 F5) (F6 F7) (X X) | (F6 F7) (X X) (X X) (X X) + Vector256 row67_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); + // row67_EF - (X X) (X F5) (E6 X) (E7 F6) | (X X) (X X) (F7 X) (X X) + row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte()); + + // row67_GH - (G4 G5) (H2 H3) (X X) (X X) | (G4 G5) (G6 G7) (H4 H5) (H6 H7) + Vector256 row67_GH = Avx2.PermuteVar8x32(GH.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte(); + // row67_GH - (H2 H3) (G4 X) (X X) (X X) | (G5 H4) (H5 G6) (X G7) (H6 H7) + row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte()); + + Vector256 row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH); + + dest.V01 = row01.AsInt16(); + dest.V23 = row23.AsInt16(); + dest.V45 = row45.AsInt16(); + dest.V67 = 
row67.AsInt16(); + } + } + } +} +#endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs index 737652d4e..c2b0fc5d0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs @@ -4,19 +4,17 @@ using System; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; namespace SixLabors.ImageSharp.Formats.Jpeg.Components { - /// - /// Holds the Jpeg UnZig array in a value/stack type. - /// Unzig maps from the zigzag ordering to the natural ordering. For example, - /// unzig[3] is the column and row of the fourth element in zigzag order. The - /// value is 16, which means first column (16%8 == 0) and third row (16/8 == 2). - /// - [StructLayout(LayoutKind.Sequential)] - internal unsafe struct ZigZag + internal static partial class ZigZag { /// + /// Gets span of zig-zag ordering indices. + /// + /// /// When reading corrupted data, the Huffman decoders could attempt /// to reference an entry beyond the end of this array (if the decoded /// zero run length reaches past the end of the block). To prevent @@ -25,20 +23,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// to be stored in location 63 of the block, not somewhere random. /// The worst case would be a run-length of 15, which means we need 16 /// fake entries. - /// - private const int Size = 64 + 16; - - /// - /// Copy of in a value type - /// - public fixed byte Data[Size]; - - /// - /// Gets the unzigs map, which maps from the zigzag ordering to the natural ordering. - /// For example, unzig[3] is the column and row of the fourth element in zigzag order. - /// The value is 16, which means first column (16%8 == 0) and third row (16/8 == 2). 
- /// - private static ReadOnlySpan Unzig => new byte[] + /// + public static ReadOnlySpan ZigZagOrder => new byte[] { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, @@ -48,53 +34,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, - 63, 63, 63, 63, 63, 63, 63, 63, // Extra entries for safety in decoder + + // Extra entries for safety in decoder + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }; - - /// - /// Returns the value at the given index - /// - /// The index - /// The - public byte this[int idx] - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get - { - ref byte self = ref Unsafe.As(ref this); - return Unsafe.Add(ref self, idx); - } - } - - /// - /// Creates and fills an instance of with Jpeg unzig indices - /// - /// The new instance - public static ZigZag CreateUnzigTable() - { - ZigZag result = default; - ref byte sourceRef = ref MemoryMarshal.GetReference(Unzig); - ref byte destinationRef = ref Unsafe.AsRef(result.Data); - - Unzig.CopyTo(new Span(result.Data, Size)); - - return result; - } - - /// - /// Apply Zigging to the given quantization table, so it will be sufficient to multiply blocks for dequantizing them. 
- /// - public static Block8x8F CreateDequantizationTable(ref Block8x8F qt) - { - Block8x8F result = default; - - for (int i = 0; i < Block8x8F.Size; i++) - { - result[Unzig[i]] = qt[i]; - } - - return result; - } } } diff --git a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs index e94b07faa..477054264 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs @@ -740,9 +740,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg stream.Read(this.temp, 0, 64); remaining -= 64; + // Parsing quantization table & saving it in natural order for (int j = 0; j < 64; j++) { - table[j] = this.temp[j]; + table[ZigZag.ZigZagOrder[j]] = this.temp[j]; } break; @@ -760,9 +761,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg stream.Read(this.temp, 0, 128); remaining -= 128; + // Parsing quantization table & saving it in natural order for (int j = 0; j < 64; j++) { - table[j] = (this.temp[2 * j] << 8) | this.temp[(2 * j) + 1]; + table[ZigZag.ZigZagOrder[j]] = (this.temp[2 * j] << 8) | this.temp[(2 * j) + 1]; } break; diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs index 8c6726e65..85a2c6846 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs @@ -151,7 +151,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg dqt[offset++] = (byte)i; for (int j = 0; j < Block8x8F.Size; j++) { - dqt[offset++] = (byte)quant[j]; + dqt[offset++] = (byte)quant[ZigZag.ZigZagOrder[j]]; } } @@ -635,11 +635,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg /// Initializes quntization tables. /// /// + /// + /// Zig-zag ordering is NOT applied to the resulting tables. + /// + /// /// We take quality values in a hierarchical order: /// 1. Check if encoder has set quality - /// 2. Check if metadata has special table for encoding - /// 3. Check if metadata has set quality - /// 4. 
Take default quality value - 75 + /// 2. Check if metadata has set quality + /// 3. Take default quality value - 75 + /// /// /// Color components count. /// Jpeg metadata instance. diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 42fdd603e..fc642dcc7 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -272,32 +272,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg this.CompareBlocks(expected, actual, 0); } + // TODO: intrinsic tests [Theory] - [InlineData(1)] - [InlineData(2)] - public unsafe void Quantize(int seed) + [InlineData(1, 2)] + [InlineData(2, 1)] + public void Quantize(int srcSeed, int qtSeed) { - var block = default(Block8x8F); - block.LoadFrom(Create8x8RoundedRandomFloatData(-2000, 2000, seed)); - - var qt = default(Block8x8F); - qt.LoadFrom(Create8x8RoundedRandomFloatData(-2000, 2000, seed)); - - var unzig = ZigZag.CreateUnzigTable(); + Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed); + Block8x8F quant = CreateRandomFloatBlock(-2000, 2000, qtSeed); - int* expectedResults = stackalloc int[Block8x8F.Size]; - ReferenceImplementations.QuantizeRational(&block, expectedResults, &qt, unzig.Data); + Block8x8 expected = default; + ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder); - var actualResults = default(Block8x8F); + Block8x8 actual = default; + Block8x8F.Quantize(ref source, ref actual, ref quant); - Block8x8F.Quantize(ref block, ref actualResults, ref qt, ref unzig); - - for (int i = 0; i < Block8x8F.Size; i++) + for (int i = 0; i < Block8x8.Size; i++) { - int expected = expectedResults[i]; - int actual = (int)actualResults[i]; - - Assert.Equal(expected, actual); + Assert.Equal(expected[i], actual[i]); } } @@ -368,48 +360,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX); } - [Theory] - 
[InlineData(1)] - [InlineData(2)] - [InlineData(3)] - public unsafe void DequantizeBlock(int seed) - { - Block8x8F original = CreateRandomFloatBlock(-500, 500, seed); - Block8x8F qt = CreateRandomFloatBlock(0, 10, seed + 42); - - var unzig = ZigZag.CreateUnzigTable(); - - Block8x8F expected = original; - Block8x8F actual = original; - - ReferenceImplementations.DequantizeBlock(&expected, &qt, unzig.Data); - Block8x8F.DequantizeBlock(&actual, &qt, unzig.Data); - - this.CompareBlocks(expected, actual, 0); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - [InlineData(3)] - public unsafe void ZigZag_CreateDequantizationTable_MultiplicationShouldQuantize(int seed) - { - Block8x8F original = CreateRandomFloatBlock(-500, 500, seed); - Block8x8F qt = CreateRandomFloatBlock(0, 10, seed + 42); - - var unzig = ZigZag.CreateUnzigTable(); - Block8x8F zigQt = ZigZag.CreateDequantizationTable(ref qt); - - Block8x8F expected = original; - Block8x8F actual = original; - - ReferenceImplementations.DequantizeBlock(&expected, &qt, unzig.Data); - - actual.MultiplyInPlace(ref zigQt); - - this.CompareBlocks(expected, actual, 0); - } - [Fact] public void AddToAllInPlace() { diff --git a/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs index 03f7020c0..4505ef538 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs @@ -21,7 +21,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Block8x8F table = JpegQuantization.ScaleLuminanceTable(quality); int estimatedQuality = JpegQuantization.EstimateLuminanceQuality(ref table); - Assert.True(quality.Equals(estimatedQuality), $"Failed to estimate luminance quality for standard table at quality level {quality}"); + Assert.True( + quality.Equals(estimatedQuality), + $"Failed to estimate luminance quality for standard table at quality level {quality}"); } } @@ -35,7 +37,9 @@ namespace 
SixLabors.ImageSharp.Tests.Formats.Jpg Block8x8F table = JpegQuantization.ScaleChrominanceTable(quality); int estimatedQuality = JpegQuantization.EstimateChrominanceQuality(ref table); - Assert.True(quality.Equals(estimatedQuality), $"Failed to estimate chrominance quality for standard table at quality level {quality}"); + Assert.True( + quality.Equals(estimatedQuality), + $"Failed to estimate chrominance quality for standard table at quality level {quality}"); } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs index 2c673f30e..aa98a7379 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs @@ -15,18 +15,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils /// internal static partial class ReferenceImplementations { - public static unsafe void DequantizeBlock(Block8x8F* blockPtr, Block8x8F* qtPtr, byte* unzigPtr) + public static void DequantizeBlock(ref Block8x8F block, ref Block8x8F qt, ReadOnlySpan zigzag) { - float* b = (float*)blockPtr; - float* qtp = (float*)qtPtr; - for (int qtIndex = 0; qtIndex < Block8x8F.Size; qtIndex++) + for (int i = 0; i < Block8x8F.Size; i++) { - byte i = unzigPtr[qtIndex]; - float* unzigPos = b + i; - - float val = *unzigPos; - val *= qtp[qtIndex]; - *unzigPos = val; + int zig = zigzag[i]; + block[zig] *= qt[i]; } } @@ -101,42 +95,18 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils /// /// Reference implementation to test . - /// Rounding is done used an integer-based algorithm defined in . /// - /// The input block - /// The destination block of integers - /// The quantization table - /// Pointer to - public static unsafe void QuantizeRational(Block8x8F* src, int* dest, Block8x8F* qt, byte* unzigPtr) + /// The input block. + /// The destination block of 16bit integers. + /// The quantization table. 
+ /// Zig-Zag index sequence span. + public static void Quantize(ref Block8x8F src, ref Block8x8 dest, ref Block8x8F qt, ReadOnlySpan zigzag) { - float* s = (float*)src; - float* q = (float*)qt; - - for (int zig = 0; zig < Block8x8F.Size; zig++) + for (int i = 0; i < Block8x8F.Size; i++) { - int a = (int)s[unzigPtr[zig]]; - int b = (int)q[zig]; - - int val = RationalRound(a, b); - dest[zig] = val; + int zig = zigzag[i]; + dest[i] = (short)Math.Round(src[zig] / qt[zig], MidpointRounding.AwayFromZero); } } - - /// - /// Rounds a rational number defined as dividend/divisor into an integer. - /// - /// The dividend. - /// The divisor. - /// The rounded value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int RationalRound(int dividend, int divisor) - { - if (dividend >= 0) - { - return (dividend + (divisor >> 1)) / divisor; - } - - return -((-dividend + (divisor >> 1)) / divisor); - } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs index e03cf9958..39046438a 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs @@ -13,8 +13,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg public void ZigZagCanHandleAllPossibleCoefficients() { // Mimic the behaviour of the huffman scan decoder using all possible byte values - var block = new short[64]; - var zigzag = ZigZag.CreateUnzigTable(); + short[] block = new short[64]; for (int h = 0; h < 255; h++) { @@ -27,7 +26,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg if (s != 0) { i += r; - block[zigzag[i++]] = (short)s; + block[ZigZag.ZigZagOrder[i++]] = (short)s; } else { From a220b3d5b894724fb7722efae95cd75c83609edc Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sat, 28 Aug 2021 19:29:30 +0300 Subject: [PATCH 17/56] Removed obsolete code, tests cleanup --- .../Formats/Jpeg/Components/Block8x8F.cs | 57 ------- .../Formats/Jpeg/Components/ZigZag.cs | 4 - 
.../Formats/Jpg/Block8x8Tests.cs | 155 +++++++++++++++++- .../Formats/Jpg/HuffmanScanEncoderTests.cs | 152 ----------------- 4 files changed, 154 insertions(+), 214 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 79a35e2cd..b29c13e6e 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -768,62 +768,5 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components return true; } } - - /// - /// Returns index of the last non-zero element in this matrix. - /// - /// - /// Index of the last non-zero element. Returns -1 if all elements are equal to zero. - /// - [MethodImpl(InliningOptions.ShortMethod)] - public int GetLastNonZeroIndex() - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) - { - const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); - - Vector256 zero8 = Vector256.Zero; - - ref Vector256 mcuStride = ref Unsafe.As>(ref this); - - for (int i = 7; i >= 0; i--) - { - int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte()); - - if (areEqual != equalityMask) - { - // Each 4 bits represents comparison operation for each 4-byte element in input vectors - // LSB represents first element in the stride - // MSB represents last element in the stride - // lzcnt operation would calculate number of zero numbers at the end - - // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements - // So we need to invert it - int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual); - - // As input number is represented by 4 bits in the mask, we need to divide lzcnt result by 4 - // to get the exact number of zero elements in the stride - int strideRelativeIndex = 7 - (lzcnt / 4); - return (i * 8) + strideRelativeIndex; - } - } - - return -1; - } - 
else -#endif - { - int index = Size - 1; - ref float elemRef = ref Unsafe.As(ref this); - - while (index >= 0 && (int)Unsafe.Add(ref elemRef, index) == 0) - { - index--; - } - - return index; - } - } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs index c2b0fc5d0..e519a8a1d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs @@ -2,10 +2,6 @@ // Licensed under the Apache License, Version 2.0. using System; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; namespace SixLabors.ImageSharp.Formats.Jpeg.Components { diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs index afe71ad04..6d73181cb 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs @@ -1,9 +1,10 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. 
+using System; using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils; - +using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; using Xunit.Abstractions; @@ -121,5 +122,157 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Assert.Equal(15, d); } + + [Fact] + public void GetLastNonZeroIndex_AllZero() + { + static void RunTest() + { + Block8x8 data = default; + + int expected = -1; + + int actual = data.GetLastNonZeroIndex(); + + Assert.Equal(expected, actual); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + + [Fact] + public void GetLastNonZeroIndex_AllNonZero() + { + static void RunTest() + { + Block8x8 data = default; + for (int i = 0; i < Block8x8.Size; i++) + { + data[i] = 10; + } + + int expected = Block8x8.Size - 1; + + int actual = data.GetLastNonZeroIndex(); + + Assert.Equal(expected, actual); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + + [Theory] + [InlineData(1)] + [InlineData(2)] + public void GetLastNonZeroIndex_RandomFilledSingle(int seed) + { + static void RunTest(string seedSerialized) + { + int seed = FeatureTestRunner.Deserialize(seedSerialized); + var rng = new Random(seed); + + for (int i = 0; i < 1000; i++) + { + Block8x8 data = default; + + int setIndex = rng.Next(1, Block8x8.Size); + data[setIndex] = (short)rng.Next(-2000, 2000); + + int expected = setIndex; + + int actual = data.GetLastNonZeroIndex(); + + Assert.Equal(expected, actual); + } + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + seed, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + + [Theory] + [InlineData(1)] + [InlineData(2)] + public void GetLastNonZeroIndex_RandomFilledPartially(int seed) + { + static void RunTest(string seedSerialized) + { + int seed = FeatureTestRunner.Deserialize(seedSerialized); + var rng = new Random(seed); + + for (int 
i = 0; i < 1000; i++) + { + Block8x8 data = default; + + int lastIndex = rng.Next(1, Block8x8.Size); + short fillValue = (short)rng.Next(-2000, 2000); + for (int dataIndex = 0; dataIndex <= lastIndex; dataIndex++) + { + data[dataIndex] = fillValue; + } + + int expected = lastIndex; + + int actual = data.GetLastNonZeroIndex(); + + Assert.Equal(expected, actual); + } + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + seed, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + + [Theory] + [InlineData(1)] + [InlineData(2)] + public void GetLastNonZeroIndex_RandomFilledFragmented(int seed) + { + static void RunTest(string seedSerialized) + { + int seed = FeatureTestRunner.Deserialize(seedSerialized); + var rng = new Random(seed); + + for (int i = 0; i < 1000; i++) + { + Block8x8 data = default; + + short fillValue = (short)rng.Next(-2000, 2000); + + // first filled chunk + int lastIndex1 = rng.Next(1, Block8x8F.Size / 2); + for (int dataIndex = 0; dataIndex <= lastIndex1; dataIndex++) + { + data[dataIndex] = fillValue; + } + + // second filled chunk, there might be a spot with zero(s) between first and second chunk + int lastIndex2 = rng.Next(lastIndex1 + 1, Block8x8F.Size); + for (int dataIndex = 0; dataIndex <= lastIndex2; dataIndex++) + { + data[dataIndex] = fillValue; + } + + int expected = lastIndex2; + + int actual = data.GetLastNonZeroIndex(); + + Assert.Equal(expected, actual); + } + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + seed, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs index a3aa957ee..42f2fa0d5 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs @@ -85,157 +85,5 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Assert.Equal(expected, actual); } } - - [Fact] - public 
void GetLastNonZeroIndex_AllZero() - { - static void RunTest() - { - Block8x8F data = default; - - int expectedLessThan = 1; - - int actual = data.GetLastNonZeroIndex(); - - Assert.True(actual < expectedLessThan); - } - - FeatureTestRunner.RunWithHwIntrinsicsFeature( - RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); - } - - [Fact] - public void GetLastNonZeroIndex_AllNonZero() - { - static void RunTest() - { - Block8x8F data = default; - for (int i = 0; i < Block8x8F.Size; i++) - { - data[i] = 10; - } - - int expected = Block8x8F.Size - 1; - - int actual = data.GetLastNonZeroIndex(); - - Assert.Equal(expected, actual); - } - - FeatureTestRunner.RunWithHwIntrinsicsFeature( - RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void GetLastNonZeroIndex_RandomFilledSingle(int seed) - { - static void RunTest(string seedSerialized) - { - int seed = FeatureTestRunner.Deserialize(seedSerialized); - var rng = new Random(seed); - - for (int i = 0; i < 1000; i++) - { - Block8x8F data = default; - - int setIndex = rng.Next(1, Block8x8F.Size); - data[setIndex] = rng.Next(); - - int expected = setIndex; - - int actual = data.GetLastNonZeroIndex(); - - Assert.Equal(expected, actual); - } - } - - FeatureTestRunner.RunWithHwIntrinsicsFeature( - RunTest, - seed, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void GetLastNonZeroIndex_RandomFilledPartially(int seed) - { - static void RunTest(string seedSerialized) - { - int seed = FeatureTestRunner.Deserialize(seedSerialized); - var rng = new Random(seed); - - for (int i = 0; i < 1000; i++) - { - Block8x8F data = default; - - int lastIndex = rng.Next(1, Block8x8F.Size); - int fillValue = rng.Next(); - for (int dataIndex = 0; dataIndex <= lastIndex; dataIndex++) - { - data[dataIndex] = fillValue; - } - - int expected = lastIndex; - - int actual = data.GetLastNonZeroIndex(); - - 
Assert.Equal(expected, actual); - } - } - - FeatureTestRunner.RunWithHwIntrinsicsFeature( - RunTest, - seed, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void GetLastNonZeroIndex_RandomFilledFragmented(int seed) - { - static void RunTest(string seedSerialized) - { - int seed = FeatureTestRunner.Deserialize(seedSerialized); - var rng = new Random(seed); - - for (int i = 0; i < 1000; i++) - { - Block8x8F data = default; - - int fillValue = rng.Next(); - - // first filled chunk - int lastIndex1 = rng.Next(1, Block8x8F.Size / 2); - for (int dataIndex = 0; dataIndex <= lastIndex1; dataIndex++) - { - data[dataIndex] = fillValue; - } - - // second filled chunk, there might be a spot with zero(s) between first and second chunk - int lastIndex2 = rng.Next(lastIndex1 + 1, Block8x8F.Size); - for (int dataIndex = 0; dataIndex <= lastIndex2; dataIndex++) - { - data[dataIndex] = fillValue; - } - - int expected = lastIndex2; - - int actual = data.GetLastNonZeroIndex(); - - Assert.Equal(expected, actual); - } - } - - FeatureTestRunner.RunWithHwIntrinsicsFeature( - RunTest, - seed, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); - } } } From cc99da35bf20804ae57000e15bb75b4c330a8679 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 29 Aug 2021 05:35:58 +0300 Subject: [PATCH 18/56] Added DCT in place --- .../Decoder/JpegBlockPostProcessor.cs | 24 ++++------ .../Components/Encoder/HuffmanScanEncoder.cs | 22 +++++----- .../Jpeg/Components/FastFloatingPointDCT.cs | 44 +++++++++++++++---- .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs | 2 +- 4 files changed, 57 insertions(+), 35 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs index 00169d082..cf5fdd2df 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs +++ 
b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs @@ -19,14 +19,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder public Block8x8F SourceBlock; /// - /// Temporal block 1 to store intermediate and/or final computation results. + /// Temporal block to store intermediate computation results. /// - public Block8x8F WorkspaceBlock1; - - /// - /// Temporal block 2 to store intermediate and/or final computation results. - /// - public Block8x8F WorkspaceBlock2; + public Block8x8F WorkspaceBlock; /// /// The quantization table as . @@ -50,8 +45,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder this.subSamplingDivisors = component.SubSamplingDivisors; this.SourceBlock = default; - this.WorkspaceBlock1 = default; - this.WorkspaceBlock2 = default; + this.WorkspaceBlock = default; } /// @@ -71,20 +65,20 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder int destAreaStride, float maximumValue) { - ref Block8x8F b = ref this.SourceBlock; - b.LoadFrom(ref sourceBlock); + ref Block8x8F block = ref this.SourceBlock; + block.LoadFrom(ref sourceBlock); // Dequantize: - b.MultiplyInPlace(ref this.DequantiazationTable); + block.MultiplyInPlace(ref this.DequantiazationTable); - FastFloatingPointDCT.TransformIDCT(ref b, ref this.WorkspaceBlock1, ref this.WorkspaceBlock2); + FastFloatingPointDCT.TransformInplaceIDCT(ref block, ref this.WorkspaceBlock); // To conform better to libjpeg we actually NEED TO loose precision here. // This is because they store blocks as Int16 between all the operations. // To be "more accurate", we need to emulate this by rounding! 
- this.WorkspaceBlock1.NormalizeColorsAndRoundInPlace(maximumValue); + block.NormalizeColorsAndRoundInPlace(maximumValue); - this.WorkspaceBlock1.ScaledCopyTo( + block.ScaledCopyTo( ref destAreaOrigin, destAreaStride, this.subSamplingDivisors.Width, diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 8b61b66c9..4f5ffb3f8 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -94,8 +94,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// private int bitCount; - private Block8x8F temporalBlock1; - private Block8x8F temporalBlock2; + private Block8x8F temporalBlock; private Block8x8 temporalShortBlock; /// @@ -299,23 +298,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// The quantization table index. /// The previous DC value. - /// Source block - /// Quantization table - /// The 8x8 Unzig block. + /// Source block. + /// Quantization table. /// The . private int WriteBlock( QuantIndex index, int prevDC, - ref Block8x8F src, + ref Block8x8F block, ref Block8x8F quant) { - ref Block8x8F refTemp1 = ref this.temporalBlock1; - ref Block8x8F refTemp2 = ref this.temporalBlock2; + ref Block8x8F refTemp = ref this.temporalBlock; ref Block8x8 spectralBlock = ref this.temporalShortBlock; - FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2); + // Shifting level from 0..255 to -128..127 + block.AddInPlace(-128f); - Block8x8F.Quantize(ref refTemp1, ref spectralBlock, ref quant); + // Discrete cosine transform + FastFloatingPointDCT.TransformInplaceFDCT(ref block, ref refTemp); + + // Quantization + Block8x8F.Quantize(ref block, ref spectralBlock, ref quant); // Emit the DC delta. 
int dc = spectralBlock[0]; diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 0f569b5da..dd46a83e3 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -276,28 +276,36 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Source /// Destination /// Temporary block provided by the caller for optimization - /// If true, a constant -128.0 offset is applied for all values before FDCT public static void TransformFDCT( ref Block8x8F src, ref Block8x8F dest, - ref Block8x8F temp, - bool offsetSourceByNeg128 = true) + ref Block8x8F temp) { src.TransposeInto(ref temp); - if (offsetSourceByNeg128) - { - temp.AddInPlace(-128F); - } - FDCT8x8(ref temp, ref dest); dest.TransposeInto(ref temp); - FDCT8x8(ref temp, ref dest); dest.MultiplyInPlace(C_0_125); } + /// + /// Apply floating point FDCT inplace. + /// + /// Input matrix. + /// Matrix to store temporal results. + public static void TransformInplaceFDCT(ref Block8x8F matrix, ref Block8x8F temp) + { + matrix.TransposeInto(ref temp); + FDCT8x8(ref temp, ref matrix); + + matrix.TransposeInto(ref temp); + FDCT8x8(ref temp, ref matrix); + + matrix.MultiplyInPlace(C_0_125); + } + /// /// Performs 8x8 matrix Inverse Discrete Cosine Transform /// @@ -510,5 +518,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? dest.MultiplyInPlace(C_0_125); } + + /// + /// Apply floating point IDCT inplace. + /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. + /// + /// Input matrix. + /// Matrix to store temporal results. 
+ public static void TransformInplaceIDCT(ref Block8x8F block, ref Block8x8F temp) + { + block.TransposeInto(ref temp); + + IDCT8x8(ref temp, ref block); + block.TransposeInto(ref temp); + IDCT8x8(ref temp, ref block); + + // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? + block.MultiplyInPlace(C_0_125); + } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index d49a6498c..34ca7f9eb 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -310,7 +310,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true); // testee - FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false); + FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2); var actualDest = new float[64]; destBlock.ScaledCopyTo(actualDest); From 839da83f17b55e97fe96720e754eb4a60d2cd302 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 31 Aug 2021 19:19:00 +0300 Subject: [PATCH 19/56] Update sandbox --- .../Program.cs | 78 +++++++------------ 1 file changed, 28 insertions(+), 50 deletions(-) diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs index bdba1bef6..ef41294bc 100644 --- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs +++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs @@ -34,70 +34,46 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox /// public static void Main(string[] args) { - /* Master */ - // Elapsed: 5431ms across 200 iterations - // Average: 27,155ms - - /* Inserting stuff bytes later */ - // Elapsed: 5300ms across 200 iterations - // Average: 26,5ms - - /* Flush if check */ - // Elapsed: 5209ms across 200 iterations - // Average: 26,045ms - - /* [INVALID] int32 flush - invalid flush order */ - // 
Elapsed: 4784ms across 200 iterations - // Average: 23,92ms - - /* int32 flush - correct flush order */ - // Elapsed: 5049ms across 200 iterations - // Average: 25,245ms - - /* int32 flush - identical file output */ - // Elapsed: 4800ms across 200 iterations - // Average: 24.00ms - - /* int32 flush - optimized huffman storage & reduced instructions per Emit() */ - // Elapsed: 4680ms across 200 iterations - // Average: 23,4ms - - /* int32 flush - merged prefix & value Emit() call */ - // Elapsed: 4644ms across 200 iterations - // Average: 23,22ms - - - /* Fixed last valuable index calculation */ - // Elapsed: 4606ms across 200 iterations - // Average: 23,03ms - - /* Intrinsic last valuable index */ - // Elapsed: 4519ms across 200 iterations - // Average: 22,595ms - - BenchmarkEncoder("uniform_size", 200, 100); - - //ReEncodeImage("uniform_size", 100); + BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio444); + //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCr, JpegSubsample.Ratio444); + //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCr, JpegSubsample.Ratio444); + //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCr, JpegSubsample.Ratio444); + + //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio420); + //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCr, JpegSubsample.Ratio420); + //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCr, JpegSubsample.Ratio420); + //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCr, JpegSubsample.Ratio420); + + //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.Luminance, JpegSubsample.Ratio444); + //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.Luminance, JpegSubsample.Ratio444); + //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.Luminance, JpegSubsample.Ratio444); + //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.Luminance, JpegSubsample.Ratio444); + + //ReEncodeImage("snow_main", 100); + 
//ReEncodeImage("snow_main", 90); + //ReEncodeImage("snow_main", 75); + //ReEncodeImage("snow_main", 50); Console.WriteLine("Done."); } const string pathTemplate = "C:\\Users\\pl4nu\\Downloads\\{0}.jpg"; - private static void BenchmarkEncoder(string fileName, int iterations, int quality) + private static void BenchmarkEncoder(string fileName, int iterations, int quality, JpegColorType color, JpegSubsample subsample) { string loadPath = String.Format(pathTemplate, fileName); + using var inputStream = new FileStream(loadPath, FileMode.Open); using var saveStream = new MemoryStream(); var decoder = new JpegDecoder { IgnoreMetadata = true }; - using Image img = decoder.Decode(Configuration.Default, new FileStream(loadPath, FileMode.Open)); + using Image img = decoder.Decode(Configuration.Default, inputStream); var encoder = new JpegEncoder() { Quality = quality, - ColorType = JpegColorType.YCbCr, - Subsample = JpegSubsample.Ratio444 + ColorType = color, + Subsample = subsample }; Stopwatch sw = new Stopwatch(); @@ -109,7 +85,9 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox } sw.Stop(); - Console.WriteLine($"// Elapsed: {sw.ElapsedMilliseconds}ms across {iterations} iterations\n// Average: {(double)sw.ElapsedMilliseconds / iterations}ms"); + Console.WriteLine($"// Encoding q={quality} | color={color} | sub={subsample}\n" + + $"// Elapsed: {sw.ElapsedMilliseconds}ms across {iterations} iterations\n" + + $"// Average: {(double)sw.ElapsedMilliseconds / iterations}ms"); } private static void ReEncodeImage(string fileName, int quality) @@ -117,7 +95,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox string loadPath = String.Format(pathTemplate, fileName); using Image img = Image.Load(loadPath); - string savePath = String.Format(pathTemplate, $"testSave_{fileName}"); + string savePath = String.Format(pathTemplate, $"q{quality}_test_{fileName}"); var encoder = new JpegEncoder() { Quality = quality, From e3d328053b9e1f426acc9f14c79be55cff8dda8c Mon Sep 17 
00:00:00 2001 From: Dmitry Pentin Date: Mon, 6 Sep 2021 07:42:51 +0300 Subject: [PATCH 20/56] 1 --- shared-infrastructure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared-infrastructure b/shared-infrastructure index 9b94ebc4b..f48ab8291 160000 --- a/shared-infrastructure +++ b/shared-infrastructure @@ -1 +1 @@ -Subproject commit 9b94ebc4be9b7a8d7620c257e6ee485455973332 +Subproject commit f48ab829167c42c69242ed0d303683232fbfccd1 From 81204d3fcb481d7da9427dc6b9d6cbac65d3880a Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 6 Sep 2021 08:10:36 +0300 Subject: [PATCH 21/56] Fixed switch for color type --- .../Formats/Jpeg/JpegEncoderCore.cs | 40 ++++++++----------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs index 1a911ecb0..6ff887667 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs @@ -131,29 +131,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg this.WriteStartOfScan(componentCount, componentIds); // Write the scan compressed data. - if (this.colorType == JpegColorType.Luminance) - { - // luminance quantization table only - new HuffmanScanEncoder(1, stream).EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken); - } - else - { - // luminance and chrominance quantization tables. 
- switch (this.colorType) - { - case JpegColorType.YCbCrRatio444: - new HuffmanScanEncoder(3, stream).Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); - break; - case JpegColorType.YCbCrRatio420: - new HuffmanScanEncoder(6, stream).Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); - break; - case JpegColorType.Luminance: - new HuffmanScanEncoder(1, stream).EncodeGrayscale(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); - break; - case JpegColorType.Rgb: - new HuffmanScanEncoder(3, stream).EncodeRgb(image, ref luminanceQuantTable, cancellationToken); - break; - } + switch (this.colorType) + { + case JpegColorType.YCbCrRatio444: + new HuffmanScanEncoder(3, stream).Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); + break; + case JpegColorType.YCbCrRatio420: + new HuffmanScanEncoder(6, stream).Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); + break; + case JpegColorType.Luminance: + new HuffmanScanEncoder(1, stream).EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken); + break; + case JpegColorType.Rgb: + new HuffmanScanEncoder(3, stream).EncodeRgb(image, ref luminanceQuantTable, cancellationToken); + break; + default: + // all other non-supported color types are checked at the start of this method + break; } // Write the End Of Image marker. 
From 7a21a889446027cd81ee84dbd29cca74cb9a3642 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 6 Sep 2021 08:55:22 +0300 Subject: [PATCH 22/56] Fixed failing tests --- .../Components/Encoder/HuffmanScanEncoder.cs | 18 +++++++++--------- .../Formats/Jpg/Block8x8Tests.cs | 18 ++++++++++-------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 8e799e98b..db0bc32ae 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -303,8 +303,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { this.huffmanTables = HuffmanLut.TheHuffmanLut; - var unzig = ZigZag.CreateUnzigTable(); - // ReSharper disable once InconsistentNaming int prevDCR = 0, prevDCG = 0, prevDCB = 0; @@ -327,26 +325,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder QuantIndex.Luminance, prevDCR, ref pixelConverter.R, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); prevDCG = this.WriteBlock( QuantIndex.Luminance, prevDCG, ref pixelConverter.G, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); prevDCB = this.WriteBlock( QuantIndex.Luminance, prevDCB, ref pixelConverter.B, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); + + if (this.IsFlushNeeded) + { + this.FlushToStream(); + } } } - this.FlushInternalBuffer(); + this.FlushRemainingBytes(); } /// diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs index 6d73181cb..69375ae1b 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs @@ -248,24 +248,26 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg short fillValue = (short)rng.Next(-2000, 2000); // first filled chunk - 
int lastIndex1 = rng.Next(1, Block8x8F.Size / 2); - for (int dataIndex = 0; dataIndex <= lastIndex1; dataIndex++) + int firstChunkStart = rng.Next(0, Block8x8.Size / 2); + int firstChunkEnd = rng.Next(firstChunkStart, Block8x8.Size / 2); + for (int dataIdx = firstChunkStart; dataIdx <= firstChunkEnd; dataIdx++) { - data[dataIndex] = fillValue; + data[dataIdx] = fillValue; } // second filled chunk, there might be a spot with zero(s) between first and second chunk - int lastIndex2 = rng.Next(lastIndex1 + 1, Block8x8F.Size); - for (int dataIndex = 0; dataIndex <= lastIndex2; dataIndex++) + int secondChunkStart = rng.Next(firstChunkEnd, Block8x8.Size); + int secondChunkEnd = rng.Next(secondChunkStart, Block8x8.Size); + for (int dataIdx = secondChunkStart; dataIdx <= secondChunkEnd; dataIdx++) { - data[dataIndex] = fillValue; + data[dataIdx] = fillValue; } - int expected = lastIndex2; + int expected = secondChunkEnd; int actual = data.GetLastNonZeroIndex(); - Assert.Equal(expected, actual); + Assert.True(expected == actual, $"Expected: {expected}\nActual: {actual}\nInput matrix: {data}"); } } From 4d5886680fd6a5e4651024846b0dd177b276816f Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 6 Sep 2021 08:55:27 +0300 Subject: [PATCH 23/56] Fixed sandbox --- .../Program.cs | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs index ef41294bc..471251c2e 100644 --- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs +++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs @@ -34,7 +34,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox /// public static void Main(string[] args) { - BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio444); + //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio444); //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCr, 
JpegSubsample.Ratio444); //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCr, JpegSubsample.Ratio444); //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCr, JpegSubsample.Ratio444); @@ -49,17 +49,17 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.Luminance, JpegSubsample.Ratio444); //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.Luminance, JpegSubsample.Ratio444); - //ReEncodeImage("snow_main", 100); - //ReEncodeImage("snow_main", 90); - //ReEncodeImage("snow_main", 75); - //ReEncodeImage("snow_main", 50); + ReEncodeImage("snow_main", 100); + ReEncodeImage("snow_main", 90); + ReEncodeImage("snow_main", 75); + ReEncodeImage("snow_main", 50); Console.WriteLine("Done."); } const string pathTemplate = "C:\\Users\\pl4nu\\Downloads\\{0}.jpg"; - private static void BenchmarkEncoder(string fileName, int iterations, int quality, JpegColorType color, JpegSubsample subsample) + private static void BenchmarkEncoder(string fileName, int iterations, int quality, JpegColorType color) { string loadPath = String.Format(pathTemplate, fileName); @@ -72,8 +72,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox var encoder = new JpegEncoder() { Quality = quality, - ColorType = color, - Subsample = subsample + ColorType = color }; Stopwatch sw = new Stopwatch(); @@ -85,7 +84,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox } sw.Stop(); - Console.WriteLine($"// Encoding q={quality} | color={color} | sub={subsample}\n" + + Console.WriteLine($"// Encoding q={quality} | color={color}\n" + $"// Elapsed: {sw.ElapsedMilliseconds}ms across {iterations} iterations\n" + $"// Average: {(double)sw.ElapsedMilliseconds / iterations}ms"); } @@ -99,8 +98,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox var encoder = new JpegEncoder() { Quality = quality, - ColorType = JpegColorType.YCbCr, - Subsample = JpegSubsample.Ratio444 + ColorType = JpegColorType.Rgb }; img.SaveAsJpeg(savePath, 
encoder); } From 0b55bed262d75aac144e295bf83b98d1cb3ae142 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 7 Sep 2021 04:12:56 +0300 Subject: [PATCH 24/56] Slightly improved tiff decoding with jpeg data, removed unnecessary GC pressure --- .../Jpeg/Components/Decoder/SpectralConverter.cs | 2 +- .../Compression/Decompressors/JpegTiffCompression.cs | 11 +++++------ .../Decompressors/RgbJpegSpectralConverter.cs | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs index 23bb01409..e975b11fb 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs @@ -39,6 +39,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder /// The jpeg frame with the color space to convert to. /// The raw JPEG data. /// The color converter. - public virtual JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(jpegData.ColorSpace, frame.Precision); + protected virtual JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(jpegData.ColorSpace, frame.Precision); } } diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs index bd1c496b4..e764c014d 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs @@ -65,22 +65,21 @@ namespace SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors scanDecoder.ResetInterval = 0; jpegDecoder.ParseStream(stream, scanDecoder, CancellationToken.None); - using var image = new Image(this.configuration, spectralConverter.PixelBuffer, new ImageMetadata()); - 
CopyImageBytesToBuffer(buffer, image); + CopyImageBytesToBuffer(buffer, spectralConverter.PixelBuffer); } else { using var image = Image.Load(stream); - CopyImageBytesToBuffer(buffer, image); + CopyImageBytesToBuffer(buffer, image.Frames.RootFrame.PixelBuffer); } } - private static void CopyImageBytesToBuffer(Span buffer, Image image) + private static void CopyImageBytesToBuffer(Span buffer, Buffer2D pixelBuffer) { int offset = 0; - for (int y = 0; y < image.Height; y++) + for (int y = 0; y < pixelBuffer.Height; y++) { - Span pixelRowSpan = image.GetPixelRowSpan(y); + Span pixelRowSpan = pixelBuffer.GetRowSpan(y); Span rgbBytes = MemoryMarshal.AsBytes(pixelRowSpan); rgbBytes.CopyTo(buffer.Slice(offset)); offset += rgbBytes.Length; diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs index 45be3dd03..aefec7fa3 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs @@ -28,6 +28,6 @@ namespace SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors } /// - public override JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(JpegColorSpace.RGB, frame.Precision); + protected override JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(JpegColorSpace.RGB, frame.Precision); } } From 17ca003babe826074ee503432cc73b4ac2b872fa Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 7 Sep 2021 05:57:35 +0300 Subject: [PATCH 25/56] Fixed sandbox --- .../Program.cs | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs index 471251c2e..7f1817e5d 100644 --- 
a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs +++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs @@ -34,25 +34,25 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox /// public static void Main(string[] args) { - //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio444); - //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCr, JpegSubsample.Ratio444); - //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCr, JpegSubsample.Ratio444); - //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCr, JpegSubsample.Ratio444); - - //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCr, JpegSubsample.Ratio420); - //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCr, JpegSubsample.Ratio420); - //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCr, JpegSubsample.Ratio420); - //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCr, JpegSubsample.Ratio420); - - //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.Luminance, JpegSubsample.Ratio444); - //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.Luminance, JpegSubsample.Ratio444); - //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.Luminance, JpegSubsample.Ratio444); - //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.Luminance, JpegSubsample.Ratio444); - - ReEncodeImage("snow_main", 100); - ReEncodeImage("snow_main", 90); - ReEncodeImage("snow_main", 75); - ReEncodeImage("snow_main", 50); + BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCrRatio444); + BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCrRatio444); + BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCrRatio444); + BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCrRatio444); + + //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCrRatio420); + //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCrRatio420); + //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCrRatio420); + //BenchmarkEncoder("snow_main", 
200, 50, JpegColorType.YCbCrRatio420); + + //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.Luminance); + //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.Luminance); + //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.Luminance); + //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.Luminance); + + //ReEncodeImage("snow_main", 100); + //ReEncodeImage("snow_main", 90); + //ReEncodeImage("snow_main", 75); + //ReEncodeImage("snow_main", 50); Console.WriteLine("Done."); } @@ -98,7 +98,7 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox var encoder = new JpegEncoder() { Quality = quality, - ColorType = JpegColorType.Rgb + ColorType = JpegColorType.YCbCrRatio444 }; img.SaveAsJpeg(savePath, encoder); } From ea09d59e083e1f8365a6df7eabc01479ab9037e4 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 7 Sep 2021 07:15:04 +0300 Subject: [PATCH 26/56] Rolled back to original implementation for rounding via scalar code --- .../Formats/Jpeg/Components/Block8x8F.cs | 45 ++++++++++++++++--- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 6606acdd6..2656f07ca 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -424,16 +424,47 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components else #endif { - for (int i = 0; i < Size; i++) - { - // TODO: find a way to index block & qt matrices with natural order indices for performance? - int zig = ZigZag.ZigZagOrder[i]; - float divRes = block[zig] / qt[zig]; - dest[i] = (short)(divRes + (divRes > 0 ? 
0.5f : -0.5f)); - } + Divide(ref block, ref qt); + block.RoundInto(ref dest); } } + [MethodImpl(InliningOptions.ShortMethod)] + private static void Divide(ref Block8x8F a, ref Block8x8F b) + { + a.V0L /= b.V0L; + a.V0R /= b.V0R; + a.V1L /= b.V1L; + a.V1R /= b.V1R; + a.V2L /= b.V2L; + a.V2R /= b.V2R; + a.V3L /= b.V3L; + a.V3R /= b.V3R; + a.V4L /= b.V4L; + a.V4R /= b.V4R; + a.V5L /= b.V5L; + a.V5R /= b.V5R; + a.V6L /= b.V6L; + a.V6R /= b.V6R; + a.V7L /= b.V7L; + a.V7R /= b.V7R; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor) + { + var neg = new Vector4(-1); + var add = new Vector4(.5F); + + // sign(dividend) = max(min(dividend, 1), -1) + Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One); + + // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend) + // TODO: This is wrong but I have no idea how to fix it without if-else operator + // sign here is a value in range [-1..1], it can be equal to -0.2 for example which is wrong + return (dividend / divisor) + (sign * add); + } + public void RoundInto(ref Block8x8 dest) { for (int i = 0; i < Size; i++) From 2f143bf9d39703f37030823c45c5e200ebc46a12 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Thu, 9 Sep 2021 21:26:18 +0300 Subject: [PATCH 27/56] New FDCT method, reciprocal quantization --- .../Jpeg/Components/Block8x8F.Intrinsic.cs | 81 +++- .../Formats/Jpeg/Components/Block8x8F.cs | 209 +++------ .../Decoder/JpegBlockPostProcessor.cs | 2 +- .../Components/Encoder/HuffmanScanEncoder.cs | 34 +- .../FastFloatingPointDCT.Intrinsic.cs | 210 +++++++++ .../Jpeg/Components/FastFloatingPointDCT.cs | 400 ++++++------------ .../Jpeg/Components/ZigZag.Intrinsic.cs | 108 ++--- .../BlockOperations/Block8x8F_Transpose.cs | 8 +- .../Formats/Jpg/Block8x8FTests.cs | 50 +-- .../ImageSharp.Tests/Formats/Jpg/DCTTests.cs | 149 ++----- 10 files changed, 599 insertions(+), 652 deletions(-) create mode 100644 
src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index 073580d40..83227ff07 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -3,6 +3,7 @@ #if SUPPORTS_RUNTIME_INTRINSICS using System; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; @@ -38,7 +39,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 0, 1, 4, 5, 2, 3, 6, 7 }; - private static unsafe void DivideIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); @@ -53,8 +54,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components for (int i = 0; i < 8; i += 2) { - Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); Vector256 row = Avx2.PackSignedSaturate(row0, row1); row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16(); @@ -64,7 +65,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } } - private static void DivideIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { 
DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!"); @@ -75,13 +76,81 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components for (int i = 0; i < 16; i += 2) { - Vector128 left = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector128 right = Sse2.ConvertToVector128Int32(Sse.Divide(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); Vector128 row = Sse2.PackSignedSaturate(left, right); Unsafe.Add(ref destBase, i / 2) = row; } } + + private void TransposeAvx() + { + // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 + Vector256 r0 = Avx.InsertVector128( + this.V0, + Unsafe.As>(ref this.V4L), + 1); + + Vector256 r1 = Avx.InsertVector128( + this.V1, + Unsafe.As>(ref this.V5L), + 1); + + Vector256 r2 = Avx.InsertVector128( + this.V2, + Unsafe.As>(ref this.V6L), + 1); + + Vector256 r3 = Avx.InsertVector128( + this.V3, + Unsafe.As>(ref this.V7L), + 1); + + Vector256 r4 = Avx.InsertVector128( + Unsafe.As>(ref this.V0R).ToVector256(), + Unsafe.As>(ref this.V4R), + 1); + + Vector256 r5 = Avx.InsertVector128( + Unsafe.As>(ref this.V1R).ToVector256(), + Unsafe.As>(ref this.V5R), + 1); + + Vector256 r6 = Avx.InsertVector128( + Unsafe.As>(ref this.V2R).ToVector256(), + Unsafe.As>(ref this.V6R), + 1); + + Vector256 r7 = Avx.InsertVector128( + Unsafe.As>(ref this.V3R).ToVector256(), + Unsafe.As>(ref this.V7R), + 1); + + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackLow(r2, r3); + Vector256 v = Avx.Shuffle(t0, t2, 0x4E); + this.V0 = Avx.Blend(t0, v, 0xCC); + this.V1 = Avx.Blend(t2, v, 0x33); + + Vector256 t4 = Avx.UnpackLow(r4, r5); + Vector256 t6 = 
Avx.UnpackLow(r6, r7); + v = Avx.Shuffle(t4, t6, 0x4E); + this.V4 = Avx.Blend(t4, v, 0xCC); + this.V5 = Avx.Blend(t6, v, 0x33); + + Vector256 t1 = Avx.UnpackHigh(r0, r1); + Vector256 t3 = Avx.UnpackHigh(r2, r3); + v = Avx.Shuffle(t1, t3, 0x4E); + this.V2 = Avx.Blend(t1, v, 0xCC); + this.V3 = Avx.Blend(t3, v, 0x33); + + Vector256 t5 = Avx.UnpackHigh(r4, r5); + Vector256 t7 = Avx.UnpackHigh(r6, r7); + v = Avx.Shuffle(t5, t7, 0x4E); + this.V6 = Avx.Blend(t5, v, 0xCC); + this.V7 = Avx.Blend(t7, v, 0x33); + } } } #endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 2656f07ca..0b7873585 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -413,41 +413,41 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported) { - DivideIntoInt16_Avx2(ref block, ref qt, ref dest); + MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest); ZigZag.ApplyZigZagOrderingAvx(ref dest, ref dest); } else if (Ssse3.IsSupported) { - DivideIntoInt16_Sse2(ref block, ref qt, ref dest); + MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest); ZigZag.ApplyZigZagOrderingSse(ref dest, ref dest); } else #endif { - Divide(ref block, ref qt); + Multiply(ref block, ref qt); block.RoundInto(ref dest); } } [MethodImpl(InliningOptions.ShortMethod)] - private static void Divide(ref Block8x8F a, ref Block8x8F b) - { - a.V0L /= b.V0L; - a.V0R /= b.V0R; - a.V1L /= b.V1L; - a.V1R /= b.V1R; - a.V2L /= b.V2L; - a.V2R /= b.V2R; - a.V3L /= b.V3L; - a.V3R /= b.V3R; - a.V4L /= b.V4L; - a.V4R /= b.V4R; - a.V5L /= b.V5L; - a.V5R /= b.V5R; - a.V6L /= b.V6L; - a.V6R /= b.V6R; - a.V7L /= b.V7L; - a.V7R /= b.V7R; + private static void Multiply(ref Block8x8F a, ref Block8x8F b) + { + a.V0L *= b.V0L; + a.V0R *= b.V0R; + a.V1L *= b.V1L; + a.V1R *= b.V1R; + a.V2L *= b.V2L; + a.V2R *= b.V2R; + a.V3L *= b.V3L; + a.V3R *= b.V3R; + a.V4L 
*= b.V4L; + a.V4R *= b.V4R; + a.V5L *= b.V5L; + a.V5R *= b.V5R; + a.V6L *= b.V6L; + a.V6R *= b.V6R; + a.V7L *= b.V7L; + a.V7R *= b.V7R; } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -608,154 +608,45 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } /// - /// Transpose the block into the destination block. + /// Transpose the block inplace. /// - /// The destination block [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInto(ref Block8x8F d) + public void Transpose() { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) { - // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 - Vector256 r0 = Avx.InsertVector128( - Unsafe.As>(ref this.V0L).ToVector256(), - Unsafe.As>(ref this.V4L), - 1); - - Vector256 r1 = Avx.InsertVector128( - Unsafe.As>(ref this.V1L).ToVector256(), - Unsafe.As>(ref this.V5L), - 1); - - Vector256 r2 = Avx.InsertVector128( - Unsafe.As>(ref this.V2L).ToVector256(), - Unsafe.As>(ref this.V6L), - 1); - - Vector256 r3 = Avx.InsertVector128( - Unsafe.As>(ref this.V3L).ToVector256(), - Unsafe.As>(ref this.V7L), - 1); - - Vector256 r4 = Avx.InsertVector128( - Unsafe.As>(ref this.V0R).ToVector256(), - Unsafe.As>(ref this.V4R), - 1); - - Vector256 r5 = Avx.InsertVector128( - Unsafe.As>(ref this.V1R).ToVector256(), - Unsafe.As>(ref this.V5R), - 1); - - Vector256 r6 = Avx.InsertVector128( - Unsafe.As>(ref this.V2R).ToVector256(), - Unsafe.As>(ref this.V6R), - 1); - - Vector256 r7 = Avx.InsertVector128( - Unsafe.As>(ref this.V3R).ToVector256(), - Unsafe.As>(ref this.V7R), - 1); - - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackLow(r2, r3); - Vector256 v = Avx.Shuffle(t0, t2, 0x4E); - d.V0 = Avx.Blend(t0, v, 0xCC); - d.V1 = Avx.Blend(t2, v, 0x33); - - Vector256 t4 = Avx.UnpackLow(r4, r5); - Vector256 t6 = Avx.UnpackLow(r6, r7); - v = Avx.Shuffle(t4, t6, 0x4E); - d.V4 = Avx.Blend(t4, v, 0xCC); - d.V5 = Avx.Blend(t6, v, 0x33); - - Vector256 t1 = 
Avx.UnpackHigh(r0, r1); - Vector256 t3 = Avx.UnpackHigh(r2, r3); - v = Avx.Shuffle(t1, t3, 0x4E); - d.V2 = Avx.Blend(t1, v, 0xCC); - d.V3 = Avx.Blend(t3, v, 0x33); - - Vector256 t5 = Avx.UnpackHigh(r4, r5); - Vector256 t7 = Avx.UnpackHigh(r6, r7); - v = Avx.Shuffle(t5, t7, 0x4E); - d.V6 = Avx.Blend(t5, v, 0xCC); - d.V7 = Avx.Blend(t7, v, 0x33); + this.TransposeAvx(); } else #endif { - d.V0L.X = this.V0L.X; - d.V1L.X = this.V0L.Y; - d.V2L.X = this.V0L.Z; - d.V3L.X = this.V0L.W; - d.V4L.X = this.V0R.X; - d.V5L.X = this.V0R.Y; - d.V6L.X = this.V0R.Z; - d.V7L.X = this.V0R.W; - - d.V0L.Y = this.V1L.X; - d.V1L.Y = this.V1L.Y; - d.V2L.Y = this.V1L.Z; - d.V3L.Y = this.V1L.W; - d.V4L.Y = this.V1R.X; - d.V5L.Y = this.V1R.Y; - d.V6L.Y = this.V1R.Z; - d.V7L.Y = this.V1R.W; - - d.V0L.Z = this.V2L.X; - d.V1L.Z = this.V2L.Y; - d.V2L.Z = this.V2L.Z; - d.V3L.Z = this.V2L.W; - d.V4L.Z = this.V2R.X; - d.V5L.Z = this.V2R.Y; - d.V6L.Z = this.V2R.Z; - d.V7L.Z = this.V2R.W; - - d.V0L.W = this.V3L.X; - d.V1L.W = this.V3L.Y; - d.V2L.W = this.V3L.Z; - d.V3L.W = this.V3L.W; - d.V4L.W = this.V3R.X; - d.V5L.W = this.V3R.Y; - d.V6L.W = this.V3R.Z; - d.V7L.W = this.V3R.W; - - d.V0R.X = this.V4L.X; - d.V1R.X = this.V4L.Y; - d.V2R.X = this.V4L.Z; - d.V3R.X = this.V4L.W; - d.V4R.X = this.V4R.X; - d.V5R.X = this.V4R.Y; - d.V6R.X = this.V4R.Z; - d.V7R.X = this.V4R.W; - - d.V0R.Y = this.V5L.X; - d.V1R.Y = this.V5L.Y; - d.V2R.Y = this.V5L.Z; - d.V3R.Y = this.V5L.W; - d.V4R.Y = this.V5R.X; - d.V5R.Y = this.V5R.Y; - d.V6R.Y = this.V5R.Z; - d.V7R.Y = this.V5R.W; - - d.V0R.Z = this.V6L.X; - d.V1R.Z = this.V6L.Y; - d.V2R.Z = this.V6L.Z; - d.V3R.Z = this.V6L.W; - d.V4R.Z = this.V6R.X; - d.V5R.Z = this.V6R.Y; - d.V6R.Z = this.V6R.Z; - d.V7R.Z = this.V6R.W; - - d.V0R.W = this.V7L.X; - d.V1R.W = this.V7L.Y; - d.V2R.W = this.V7L.Z; - d.V3R.W = this.V7L.W; - d.V4R.W = this.V7R.X; - d.V5R.W = this.V7R.Y; - d.V6R.W = this.V7R.Z; - d.V7R.W = this.V7R.W; + this.TransposeScalar(); + } + } + + /// + /// Scalar inplace 
transpose implementation for + /// + [MethodImpl(InliningOptions.ShortMethod)] + private void TransposeScalar() + { + float tmp; + int horIndex, verIndex; + + // We don't care about the last row as it consists of a single element + // Which won't be swapped with anything + for (int i = 0; i < 7; i++) + { + // We don't care about the first element in each row as it's not swapped + for (int j = i + 1; j < 8; j++) + { + horIndex = (i * 8) + j; + verIndex = (j * 8) + i; + tmp = this[horIndex]; + this[horIndex] = this[verIndex]; + this[verIndex] = tmp; + } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs index cf5fdd2df..085cd4a29 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs @@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder // Dequantize: block.MultiplyInPlace(ref this.DequantiazationTable); - FastFloatingPointDCT.TransformInplaceIDCT(ref block, ref this.WorkspaceBlock); + FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock); // To conform better to libjpeg we actually NEED TO loose precision here. // This is because they store blocks as Int16 between all the operations. diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index db0bc32ae..da4723e21 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -94,8 +94,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// private int bitCount; - private Block8x8F temporalBlock; - private Block8x8 temporalShortBlock; + private Block8x8 tempBlock; /// /// The output stream. 
All attempted writes after the first error become no-ops. @@ -130,6 +129,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void Encode444(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { + // Calculate reciprocal quantization tables for FDCT method + for (int i = 0; i < 64; i++) + { + luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; + chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i]; + } + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming @@ -190,6 +196,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void Encode420(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { + // Calculate reciprocal quantization tables for FDCT method + for (int i = 0; i < 64; i++) + { + luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; + chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i]; + } + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming @@ -256,6 +269,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void EncodeGrayscale(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { + // Calculate reciprocal quantization tables for FDCT method + for (int i = 0; i < 64; i++) + { + luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; + } + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming @@ -301,6 +320,12 @@ 
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void EncodeRgb(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { + // Calculate reciprocal quantization tables for FDCT method + for (int i = 0; i < 64; i++) + { + luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; + } + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming @@ -365,14 +390,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref Block8x8F block, ref Block8x8F quant) { - ref Block8x8F refTemp = ref this.temporalBlock; - ref Block8x8 spectralBlock = ref this.temporalShortBlock; + ref Block8x8 spectralBlock = ref this.tempBlock; // Shifting level from 0..255 to -128..127 block.AddInPlace(-128f); // Discrete cosine transform - FastFloatingPointDCT.TransformInplaceFDCT(ref block, ref refTemp); + FastFloatingPointDCT.TransformFDCT(ref block); // Quantization Block8x8F.Quantize(ref block, ref spectralBlock, ref quant); diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs new file mode 100644 index 000000000..eb60445d3 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -0,0 +1,210 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +#if SUPPORTS_RUNTIME_INTRINSICS +using System; +using System.Collections.Generic; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using System.Text; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components +{ + internal static partial class FastFloatingPointDCT + { + /// + /// Gets reciprocal coefficients for jpeg quantization tables calculation. 
+ /// + /// + /// Current FDCT implementation expects its results to be multiplied by + /// a reciprocal quantization table. Values in this table must be divided + /// by quantization table values scaled with quality settings. + /// + /// + /// These values were calculated with this formula: + /// + /// value[row * 8 + col] = 1 / (scalefactor[row] * scalefactor[col] * 8); + /// + /// Where: + /// + /// scalefactor[0] = 1 + /// + /// + /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + /// + /// Values are also scaled by 8 so DCT code won't do unnecessary division. + /// + /// + public static ReadOnlySpan DctReciprocalAdjustmentCoefficients => new float[] + { + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, + 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, + 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, + 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, + 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, + }; + +#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings + private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); + private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f); + private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); + private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); + + private static readonly Vector128 mm128_F_0_7071 = 
Vector128.Create(0.707106781f); + private static readonly Vector128 mm128_F_0_3826 = Vector128.Create(0.382683433f); + private static readonly Vector128 mm128_F_0_5411 = Vector128.Create(0.541196100f); + private static readonly Vector128 mm128_F_1_3065 = Vector128.Create(1.306562965f); +#pragma warning restore SA1310, SA1311, IDE1006 + + /// + /// Apply floating point FDCT inplace using simd operations. + /// + /// Input matrix. + private static void ForwardTransformSimd(ref Block8x8F block) + { + DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation."); + + // First pass - process rows + block.Transpose(); + if (Avx.IsSupported) + { + FDCT8x8_avx(ref block); + } + else if (Sse.IsSupported) + { + // Left part + FDCT8x4_sse(ref Unsafe.As>(ref block.V0L)); + + // Right part + FDCT8x4_sse(ref Unsafe.As>(ref block.V0R)); + } + + // Second pass - process columns + block.Transpose(); + if (Avx.IsSupported) + { + FDCT8x8_avx(ref block); + } + else if (Sse.IsSupported) + { + // Left part + FDCT8x4_sse(ref Unsafe.As>(ref block.V0L)); + + // Right part + FDCT8x4_sse(ref Unsafe.As>(ref block.V0R)); + } + } + + /// + /// Apply 1D floating point FDCT inplace using SSE operations on 8x4 part of 8x8 matrix. + /// + /// + /// Requires Sse support. + /// Must be called on both 8x4 matrix parts for the full FDCT transform. 
+ /// + /// Input reference to the first + public static void FDCT8x4_sse(ref Vector128 blockRef) + { + DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation."); + + Vector128 tmp0 = Sse.Add(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14)); + Vector128 tmp7 = Sse.Subtract(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14)); + Vector128 tmp1 = Sse.Add(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12)); + Vector128 tmp6 = Sse.Subtract(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12)); + Vector128 tmp2 = Sse.Add(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10)); + Vector128 tmp5 = Sse.Subtract(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10)); + Vector128 tmp3 = Sse.Add(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8)); + Vector128 tmp4 = Sse.Subtract(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8)); + + // Even part + Vector128 tmp10 = Sse.Add(tmp0, tmp3); + Vector128 tmp13 = Sse.Subtract(tmp0, tmp3); + Vector128 tmp11 = Sse.Add(tmp1, tmp2); + Vector128 tmp12 = Sse.Subtract(tmp1, tmp2); + + Unsafe.Add(ref blockRef, 0) = Sse.Add(tmp10, tmp11); + Unsafe.Add(ref blockRef, 8) = Sse.Subtract(tmp10, tmp11); + + Vector128 z1 = Sse.Multiply(Sse.Add(tmp12, tmp13), mm128_F_0_7071); + Unsafe.Add(ref blockRef, 4) = Sse.Add(tmp13, z1); + Unsafe.Add(ref blockRef, 12) = Sse.Subtract(tmp13, z1); + + // Odd part + tmp10 = Sse.Add(tmp4, tmp5); + tmp11 = Sse.Add(tmp5, tmp6); + tmp12 = Sse.Add(tmp6, tmp7); + + Vector128 z5 = Sse.Multiply(Sse.Subtract(tmp10, tmp12), mm128_F_0_3826); + Vector128 z2 = Sse.Add(Sse.Multiply(mm128_F_0_5411, tmp10), z5); + Vector128 z4 = Sse.Add(Sse.Multiply(mm128_F_1_3065, tmp12), z5); + Vector128 z3 = Sse.Multiply(tmp11, mm128_F_0_7071); + + Vector128 z11 = Sse.Add(tmp7, z3); + Vector128 z13 = Sse.Subtract(tmp7, z3); + + Unsafe.Add(ref blockRef, 10) = Sse.Add(z13, z2); + Unsafe.Add(ref blockRef, 6) = Sse.Subtract(z13, z2); + Unsafe.Add(ref blockRef, 2) = 
Sse.Add(z11, z4); + Unsafe.Add(ref blockRef, 14) = Sse.Subtract(z11, z4); + } + + /// + /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix. + /// + /// + /// Requires Avx support. + /// + /// Input matrix. + public static void FDCT8x8_avx(ref Block8x8F block) + { + DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); + + Vector256 tmp0 = Avx.Add(block.V0, block.V7); + Vector256 tmp7 = Avx.Subtract(block.V0, block.V7); + Vector256 tmp1 = Avx.Add(block.V1, block.V6); + Vector256 tmp6 = Avx.Subtract(block.V1, block.V6); + Vector256 tmp2 = Avx.Add(block.V2, block.V5); + Vector256 tmp5 = Avx.Subtract(block.V2, block.V5); + Vector256 tmp3 = Avx.Add(block.V3, block.V4); + Vector256 tmp4 = Avx.Subtract(block.V3, block.V4); + + // Even part + Vector256 tmp10 = Avx.Add(tmp0, tmp3); + Vector256 tmp13 = Avx.Subtract(tmp0, tmp3); + Vector256 tmp11 = Avx.Add(tmp1, tmp2); + Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); + + block.V0 = Avx.Add(tmp10, tmp11); + block.V4 = Avx.Subtract(tmp10, tmp11); + + Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); + block.V2 = Avx.Add(tmp13, z1); + block.V6 = Avx.Subtract(tmp13, z1); + + // Odd part + tmp10 = Avx.Add(tmp4, tmp5); + tmp11 = Avx.Add(tmp5, tmp6); + tmp12 = Avx.Add(tmp6, tmp7); + + Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826); + Vector256 z2 = Avx.Add(Avx.Multiply(mm256_F_0_5411, tmp10), z5); + Vector256 z4 = Avx.Add(Avx.Multiply(mm256_F_1_3065, tmp12), z5); + Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); + + Vector256 z11 = Avx.Add(tmp7, z3); + Vector256 z13 = Avx.Subtract(tmp7, z3); + + block.V5 = Avx.Add(z13, z2); + block.V3 = Avx.Subtract(z13, z2); + block.V1 = Avx.Add(z11, z4); + block.V7 = Avx.Subtract(z11, z4); + } + } +} +#endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index dd46a83e3..a554e8577 100644 --- 
a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -46,11 +46,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components #if SUPPORTS_RUNTIME_INTRINSICS private static readonly Vector256 C_V_0_5411 = Vector256.Create(0.541196f); - private static readonly Vector256 C_V_1_3065 = Vector256.Create(1.306563f); private static readonly Vector256 C_V_1_1758 = Vector256.Create(1.175876f); - private static readonly Vector256 C_V_0_7856 = Vector256.Create(0.785695f); - private static readonly Vector256 C_V_1_3870 = Vector256.Create(1.387040f); - private static readonly Vector256 C_V_0_2758 = Vector256.Create(0.275899f); private static readonly Vector256 C_V_n1_9615 = Vector256.Create(-1.961570560f); private static readonly Vector256 C_V_n0_3901 = Vector256.Create(-0.390180644f); @@ -62,250 +58,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private static readonly Vector256 C_V_1_5013 = Vector256.Create(1.501321110f); private static readonly Vector256 C_V_n1_8477 = Vector256.Create(-1.847759065f); private static readonly Vector256 C_V_0_7653 = Vector256.Create(0.765366865f); - - private static readonly Vector256 C_V_InvSqrt2 = Vector256.Create(0.707107f); #endif #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore - private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f); - - /// - /// Original: - /// - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15 - /// - /// - /// Source - /// Destination - public static void FDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 c0 = s.V0L; - Vector4 c1 = s.V7L; - Vector4 t0 = c0 + c1; - Vector4 t7 = c0 - c1; - - c1 = s.V6L; - c0 = s.V1L; - Vector4 t1 = c0 + c1; - Vector4 t6 = c0 - c1; - - c1 = s.V5L; - c0 = s.V2L; - Vector4 t2 = c0 + c1; - Vector4 t5 = c0 - c1; - - c0 = s.V3L; - c1 = s.V4L; - Vector4 t3 = c0 + c1; - Vector4 t4 = c0 - c1; - - c0 = t0 + t3; - Vector4 
c3 = t0 - t3; - c1 = t1 + t2; - Vector4 c2 = t1 - t2; - - d.V0L = c0 + c1; - d.V4L = c0 - c1; - - float w0 = 0.541196f; - float w1 = 1.306563f; - - d.V2L = (w0 * c2) + (w1 * c3); - d.V6L = (w0 * c3) - (w1 * c2); - - w0 = 1.175876f; - w1 = 0.785695f; - c3 = (w0 * t4) + (w1 * t7); - c0 = (w0 * t7) - (w1 * t4); - - w0 = 1.387040f; - w1 = 0.275899f; - c2 = (w0 * t5) + (w1 * t6); - c1 = (w0 * t6) - (w1 * t5); - - d.V3L = c0 - c2; - d.V5L = c3 - c1; - - float invsqrt2 = 0.707107f; - c0 = (c0 + c2) * invsqrt2; - c3 = (c3 + c1) * invsqrt2; - - d.V1L = c0 + c3; - d.V7L = c0 - c3; - } - - /// - /// Original: - /// - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15 - /// - /// - /// Source - /// Destination - public static void FDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 c0 = s.V0R; - Vector4 c1 = s.V7R; - Vector4 t0 = c0 + c1; - Vector4 t7 = c0 - c1; - - c1 = s.V6R; - c0 = s.V1R; - Vector4 t1 = c0 + c1; - Vector4 t6 = c0 - c1; - - c1 = s.V5R; - c0 = s.V2R; - Vector4 t2 = c0 + c1; - Vector4 t5 = c0 - c1; - - c0 = s.V3R; - c1 = s.V4R; - Vector4 t3 = c0 + c1; - Vector4 t4 = c0 - c1; - - c0 = t0 + t3; - Vector4 c3 = t0 - t3; - c1 = t1 + t2; - Vector4 c2 = t1 - t2; - - d.V0R = c0 + c1; - d.V4R = c0 - c1; - - float w0 = 0.541196f; - float w1 = 1.306563f; - - d.V2R = (w0 * c2) + (w1 * c3); - d.V6R = (w0 * c3) - (w1 * c2); - - w0 = 1.175876f; - w1 = 0.785695f; - c3 = (w0 * t4) + (w1 * t7); - c0 = (w0 * t7) - (w1 * t4); - - w0 = 1.387040f; - w1 = 0.275899f; - c2 = (w0 * t5) + (w1 * t6); - c1 = (w0 * t6) - (w1 * t5); - - d.V3R = c0 - c2; - d.V5R = c3 - c1; - - c0 = (c0 + c2) * InvSqrt2; - c3 = (c3 + c1) * InvSqrt2; - - d.V1R = c0 + c3; - d.V7R = c0 - c3; - } - - /// - /// Combined operation of and - /// using AVX commands. 
- /// - /// Source - /// Destination - public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); - - Vector256 t0 = Avx.Add(s.V0, s.V7); - Vector256 t7 = Avx.Subtract(s.V0, s.V7); - Vector256 t1 = Avx.Add(s.V1, s.V6); - Vector256 t6 = Avx.Subtract(s.V1, s.V6); - Vector256 t2 = Avx.Add(s.V2, s.V5); - Vector256 t5 = Avx.Subtract(s.V2, s.V5); - Vector256 t3 = Avx.Add(s.V3, s.V4); - Vector256 t4 = Avx.Subtract(s.V3, s.V4); - - Vector256 c0 = Avx.Add(t0, t3); - Vector256 c1 = Avx.Add(t1, t2); - - // 0 4 - d.V0 = Avx.Add(c0, c1); - d.V4 = Avx.Subtract(c0, c1); - - Vector256 c3 = Avx.Subtract(t0, t3); - Vector256 c2 = Avx.Subtract(t1, t2); - - // 2 6 - d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065); - d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(c2, C_V_1_3065), c3, C_V_0_5411); - - c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856); - c0 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(t4, C_V_0_7856), t7, C_V_1_1758); - - c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6); - c1 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, t5), t6, C_V_1_3870); - - // 3 5 - d.V3 = Avx.Subtract(c0, c2); - d.V5 = Avx.Subtract(c3, c1); - - c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2); - c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2); - - // 1 7 - d.V1 = Avx.Add(c0, c3); - d.V7 = Avx.Subtract(c0, c3); -#endif - } - - /// - /// Performs 8x8 matrix Forward Discrete Cosine Transform - /// - /// Source - /// Destination - public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) - { - FDCT8x8_Avx(ref s, ref d); - } - else -#endif - { - FDCT8x4_LeftPart(ref s, ref d); - FDCT8x4_RightPart(ref s, ref d); - } - } - - /// - /// Apply floating point FDCT from src 
into dest - /// - /// Source - /// Destination - /// Temporary block provided by the caller for optimization - public static void TransformFDCT( - ref Block8x8F src, - ref Block8x8F dest, - ref Block8x8F temp) - { - src.TransposeInto(ref temp); - FDCT8x8(ref temp, ref dest); - - dest.TransposeInto(ref temp); - FDCT8x8(ref temp, ref dest); - - dest.MultiplyInPlace(C_0_125); - } - - /// - /// Apply floating point FDCT inplace. - /// - /// Input matrix. - /// Matrix to store temporal results. - public static void TransformInplaceFDCT(ref Block8x8F matrix, ref Block8x8F temp) - { - matrix.TransposeInto(ref temp); - FDCT8x8(ref temp, ref matrix); - - matrix.TransposeInto(ref temp); - FDCT8x8(ref temp, ref matrix); - - matrix.MultiplyInPlace(C_0_125); - } - /// /// Performs 8x8 matrix Inverse Discrete Cosine Transform /// @@ -501,40 +255,148 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } /// - /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization). - /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239 + /// Apply floating point IDCT inplace. + /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. /// - /// Source - /// Destination - /// Temporary block provided by the caller - public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp) + /// Input matrix. + /// Matrix to store temporal results. + public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) { - src.TransposeInto(ref temp); - - IDCT8x8(ref temp, ref dest); - dest.TransposeInto(ref temp); - IDCT8x8(ref temp, ref dest); + block.Transpose(); + IDCT8x8(ref block, ref temp); + temp.Transpose(); + IDCT8x8(ref temp, ref block); // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? 
- dest.MultiplyInPlace(C_0_125); + block.MultiplyInPlace(C_0_125); } /// - /// Apply floating point IDCT inplace. - /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. + /// Apply 2D floating point FDCT inplace using scalar operations. /// - /// Input matrix. - /// Matrix to store temporal results. - public static void TransformInplaceIDCT(ref Block8x8F block, ref Block8x8F temp) + /// + /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c. + /// + /// Input matrix. + private static void ForwardTransformScalar(ref Block8x8F block) { - block.TransposeInto(ref temp); + const int dctSize = 8; - IDCT8x8(ref temp, ref block); - block.TransposeInto(ref temp); - IDCT8x8(ref temp, ref block); + float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + float tmp10, tmp11, tmp12, tmp13; + float z1, z2, z3, z4, z5, z11, z13; - // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? 
- block.MultiplyInPlace(C_0_125); + // First pass - process rows + ref float dataRef = ref Unsafe.As(ref block); + for (int ctr = 7; ctr >= 0; ctr--) + { + tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7); + tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7); + tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6); + tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6); + tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5); + tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5); + tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4); + tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4); + + // Even part + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11; + Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11; + + z1 = (tmp12 + tmp13) * 0.707106781f; + Unsafe.Add(ref dataRef, 2) = tmp13 + z1; + Unsafe.Add(ref dataRef, 6) = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = (tmp10 - tmp12) * 0.382683433f; + z2 = (0.541196100f * tmp10) + z5; + z4 = (1.306562965f * tmp12) + z5; + z3 = tmp11 * 0.707106781f; + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + Unsafe.Add(ref dataRef, 5) = z13 + z2; + Unsafe.Add(ref dataRef, 3) = z13 - z2; + Unsafe.Add(ref dataRef, 1) = z11 + z4; + Unsafe.Add(ref dataRef, 7) = z11 - z4; + + dataRef = ref Unsafe.Add(ref dataRef, dctSize); + } + + // Second pass - process columns + dataRef = ref Unsafe.As(ref block); + for (int ctr = 7; ctr >= 0; ctr--) + { + tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7); + tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7); + tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6); + tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6); + tmp2 = Unsafe.Add(ref dataRef, 
dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5); + tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5); + tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4); + tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4); + + // Even part + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11; + Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11; + + z1 = (tmp12 + tmp13) * 0.707106781f; + Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1; + Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = (tmp10 - tmp12) * 0.382683433f; + z2 = (0.541196100f * tmp10) + z5; + z4 = (1.306562965f * tmp12) + z5; + z3 = tmp11 * 0.707106781f; + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2; + Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2; + Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4; + Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4; + + dataRef = ref Unsafe.Add(ref dataRef, 1); + } + } + + /// + /// Apply 2D floating point FDCT inplace. + /// + /// Input matrix. 
+ public static void TransformFDCT(ref Block8x8F block) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported || Sse.IsSupported) + { + ForwardTransformSimd(ref block); + } + else +#endif + { + ForwardTransformScalar(ref block); + } } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index 066eb2846..878a67b50 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -10,10 +10,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { internal static partial class ZigZag { +#pragma warning disable SA1309 // naming rules violation warnings /// /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics. /// - private const byte Z = 0xff; + private const byte _ = 0xff; +#pragma warning restore SA1309 /// /// Gets shuffle vectors for @@ -22,82 +24,82 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private static ReadOnlySpan SseShuffleMasks => new byte[] { // 0_A - 0, 1, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, 6, 7, Z, Z, + 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _, // 0_B - Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, + _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5, // 0_C - Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, // 1_A - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, 10, 11, + _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11, // 1_B - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _, // 1_C - 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, + 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _, // 1_D - Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _, // 1_E - Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, // 2_B - 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + 8, 9, 
_, _, _, _, _, _, _, _, _, _, _, _, _, _, // 2_C - Z, Z, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, // 2_D - Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _, // 2_E - Z, Z, Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, Z, Z, 4, 5, + _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, // 2_F - Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, 2, 3, Z, Z, + _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, // 2_G - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, // 3_A - Z, Z, Z, Z, Z, Z, 12, 13, 14, 15, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _, // 3_B - Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, + _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _, // 3_C - Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, + _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, // 3_D/4_E - 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, + 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, // 4_F - Z, Z, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, Z, Z, + _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _, // 4_G - Z, Z, Z, Z, 2, 3, Z, Z, Z, Z, 4, 5, Z, Z, Z, Z, + _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, // 4_H - Z, Z, Z, Z, Z, Z, 0, 1, 2, 3, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _, // 5_B - Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _, // 5_C - Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, // 5_D - 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, Z, Z, Z, Z, + 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _, // 5_E - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, // 5_F - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 8, 9, Z, Z, + _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, // 5_G - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, + _, _, _, _, _, _, _, _, _, _, 
_, _, _, _, 6, 7, // 6_D - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, // 6_E - Z, Z, Z, Z, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, + _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _, // 6_F - Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, + _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13, // 6_G - Z, Z, Z, Z, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _, // 6_H - 4, 5, 6, 7, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, // 7_F - Z, Z, Z, Z, Z, Z, Z, Z, 14, 15, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _, // 7_G - 10, 11, Z, Z, Z, Z, 12, 13, Z, Z, 14, 15, Z, Z, Z, Z, + 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, // 7_H - Z, Z, 8, 9, 10, 11, Z, Z, Z, Z, Z, Z, 12, 13, 14, 15 + _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15 }; /// @@ -110,55 +112,55 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, // 01_AB - inner-lane - 0, 1, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, 6, 7, + 0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7, // 01_CD/23_GH - cross-lane - 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, Z, Z, Z, Z, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, Z, Z, Z, Z, + 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, // 01_CD - inner-lane - Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, // 01_EF - inner-lane - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, // 
23_AB/45_CD/67_EF - cross-lane - 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, Z, Z, Z, Z, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, Z, Z, Z, Z, + 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, // 23_AB - inner-lane - 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, 2, 3, 8, 9, Z, Z, Z, Z, + 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _, // 23_CD - inner-lane - Z, Z, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 12, 13, + _, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13, // 23_EF - inner-lane - Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 23_GH - inner-lane - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 45_AB - inner-lane - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, // 45_CD - inner-lane - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, Z, Z, 2, 3, 8, 9, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, // 45_EF - cross-lane - 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, Z, Z, Z, Z, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, + 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, // 45_EF - inner-lane - 2, 3, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, Z, Z, + 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, // 
45_GH - inner-lane - Z, Z, Z, Z, 2, 3, 8, 9, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 6, 7, + _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, // 67_CD - inner-lane - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 67_EF - inner-lane - Z, Z, Z, Z, Z, Z, 6, 7, 0, 1, Z, Z, 2, 3, 8, 9, Z, Z, Z, Z, Z, Z, Z, Z, 10, 11, Z, Z, Z, Z, Z, Z, + _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, // 67_GH - inner-lane - 8, 9, 10, 11, 4, 5, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, 10, 11, 4, 5, Z, Z, 6, 7, 12, 13, 14, 15 + 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15 }; /// diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs index 1d103cd1a..8e8787475 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -12,15 +12,11 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations private static readonly Block8x8F Source = Create8x8FloatData(); [Benchmark] - public void TransposeInto() - { - var dest = default(Block8x8F); - Source.TransposeInto(ref dest); - } + public void TransposeInto() => Source.Transpose(); private static Block8x8F Create8x8FloatData() { - var result = new float[64]; + float[] result = new float[64]; for (int i = 0; i < 8; i++) { for (int j = 0; j < 8; j++) diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index fc642dcc7..89ef74d8b 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ 
b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -164,52 +164,27 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg } [Fact] - public void TransposeInto() + public void Transpose() { static void RunTest() { float[] expected = Create8x8FloatData(); ReferenceImplementations.Transpose8x8(expected); - var source = default(Block8x8F); - source.LoadFrom(Create8x8FloatData()); + var block8x8 = default(Block8x8F); + block8x8.LoadFrom(Create8x8FloatData()); - var dest = default(Block8x8F); - source.TransposeInto(ref dest); + block8x8.Transpose(); float[] actual = new float[64]; - dest.ScaledCopyTo(actual); + block8x8.ScaledCopyTo(actual); Assert.Equal(expected, actual); } FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX); - } - - private class BufferHolder - { - public Block8x8F Buffer; - } - - [Fact] - public void TransposeInto_Benchmark() - { - var source = new BufferHolder(); - source.Buffer.LoadFrom(Create8x8FloatData()); - var dest = new BufferHolder(); - - this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark X {Times} ..."); - var sw = Stopwatch.StartNew(); - - for (int i = 0; i < Times; i++) - { - source.Buffer.TransposeInto(ref dest.Buffer); - } - - sw.Stop(); - this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark finished in {sw.ElapsedMilliseconds} ms"); + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic); } private static float[] Create8x8ColorCropTestData() @@ -281,16 +256,21 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed); Block8x8F quant = CreateRandomFloatBlock(-2000, 2000, qtSeed); + // Reference implementation quantizes given block via division Block8x8 expected = default; ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder); + // Actual current implementation quantizes given block via multiplication + // With quantization table reciprocal + 
for (int i = 0; i < Block8x8F.Size; i++) + { + quant[i] = 1f / quant[i]; + } + Block8x8 actual = default; Block8x8F.Quantize(ref source, ref actual, ref quant); - for (int i = 0; i < Block8x8.Size; i++) - { - Assert.Equal(expected[i], actual[i]); - } + this.CompareBlocks(expected, actual, 1); } [Fact] diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index 34ca7f9eb..55d208c5a 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -2,6 +2,9 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics.X86; #endif @@ -33,15 +36,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); - var source = Block8x8F.Load(sourceArray); + var srcBlock = Block8x8F.Load(sourceArray); - Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source); + Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock); var temp = default(Block8x8F); - var actual = default(Block8x8F); - FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp); + FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp); - this.CompareBlocks(expected, actual, 1f); + this.CompareBlocks(expected, srcBlock, 1f); } [Theory] @@ -52,15 +54,14 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); - var source = Block8x8F.Load(sourceArray); + var srcBlock = Block8x8F.Load(sourceArray); - Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source); + Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock); var temp = default(Block8x8F); - var actual = 
default(Block8x8F); - FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp); + FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp); - this.CompareBlocks(expected, actual, 1f); + this.CompareBlocks(expected, srcBlock, 1f); } // Inverse transform @@ -167,8 +168,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg var srcBlock = default(Block8x8F); srcBlock.LoadFrom(src); - var destBlock = default(Block8x8F); - var expectedDest = new float[64]; var temp1 = new float[64]; var temp2 = default(Block8x8F); @@ -177,10 +176,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1); // testee - FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2); + FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2); var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); + srcBlock.ScaledCopyTo(actualDest); Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); } @@ -198,95 +197,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg } // Forward transform - [Theory] - [InlineData(1)] - [InlineData(2)] - public void FDCT8x4_LeftPart(int seed) - { - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); - - var expectedDest = new float[64]; - - // reference - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest); - - // testee - FastFloatingPointDCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void FDCT8x4_RightPart(int seed) - { - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = 
default(Block8x8F); - - var expectedDest = new float[64]; - - // reference - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); - - // testee - FastFloatingPointDCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void FDCT8x8_Avx(int seed) - { -#if SUPPORTS_RUNTIME_INTRINSICS - var skip = !Avx.IsSupported; -#else - var skip = true; -#endif - if (skip) - { - this.Output.WriteLine("No AVX present, skipping test!"); - return; - } - - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); - - var expectedDest = new float[64]; - - // reference, left part - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest); - - // reference, right part - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); - - // testee, whole 8x8 - FastFloatingPointDCT.FDCT8x8_Avx(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - + // This test covers entire FDCT conversions chain + // This test checks all implementations: intrinsic and scalar fallback [Theory] [InlineData(1)] [InlineData(2)] @@ -297,37 +209,38 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int seed = FeatureTestRunner.Deserialize(serialized); Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); + var block = default(Block8x8F); + block.LoadFrom(src); - var expectedDest = new float[64]; - var temp1 = new float[64]; - var temp2 = 
default(Block8x8F); + float[] expectedDest = new float[64]; + float[] temp1 = new float[64]; // reference ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true); // testee - FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2); + // Part of the FDCT calculations is fused into the quantization step + // We must multiply transformed block with reciprocal values from FastFloatingPointDCT.ANN_DCT_reciprocalAdjustmen + FastFloatingPointDCT.TransformFDCT(ref block); + for (int i = 0; i < 64; i++) + { + block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i]; + } - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); + float[] actualDest = block.ToArray(); - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); + Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f)); } // 3 paths: // 1. AllowAll - call avx/fma implementation // 2. DisableFMA - call avx implementation without fma acceleration - // 3. DisableAvx - call fallback code of Vector4 implementation - // - // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result + // 3. DisableAvx - call sse implementation + // 4. 
DisableHWIntrinsic - call scalar fallback implementation FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, seed, - HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX); + HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic); } } } From fb038aaf3c6af75ecedecee38ab11dedc2655881 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 10 Sep 2021 06:42:03 +0300 Subject: [PATCH 28/56] Tidied up DCT code --- .../Formats/Jpeg/Components/Block8x8F.cs | 109 ++++---- .../FastFloatingPointDCT.Intrinsic.cs | 230 +++++++++++++++- .../Jpeg/Components/FastFloatingPointDCT.cs | 247 ++---------------- 3 files changed, 284 insertions(+), 302 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 0b7873585..a25c572ae 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -450,21 +450,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components a.V7R *= b.V7R; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor) - { - var neg = new Vector4(-1); - var add = new Vector4(.5F); - - // sign(dividend) = max(min(dividend, 1), -1) - Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One); - - // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend) - // TODO: This is wrong but I have no idea how to fix it without if-else operator - // sign here is a value in range [-1..1], it can be equal to -0.2 for example which is wrong - return (dividend / divisor) + (sign * add); - } - public void RoundInto(ref Block8x8 dest) { for (int i = 0; i < Size; i++) @@ -562,6 +547,47 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components Unsafe.Add(ref dRef, 7) = bottom; } + /// + /// Compares entire 8x8 block to a single scalar value. + /// + /// Value to compare to. 
+ public bool EqualsToScalar(int value) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); + + var targetVector = Vector256.Create(value); + ref Vector256 blockStride = ref this.V0; + + for (int i = 0; i < RowCount; i++) + { + Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector); + if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask) + { + return false; + } + } + + return true; + } +#endif + { + ref float scalars = ref Unsafe.As(ref this); + + for (int i = 0; i < Size; i++) + { + if ((int)Unsafe.Add(ref scalars, i) != value) + { + return false; + } + } + + return true; + } + } + /// public bool Equals(Block8x8F other) => this.V0L == other.V0L @@ -598,15 +624,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components return sb.ToString(); } - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector NormalizeAndRound(Vector row, Vector off, Vector max) - { - row += off; - row = Vector.Max(row, Vector.Zero); - row = Vector.Min(row, max); - return row.FastRound(); - } - /// /// Transpose the block inplace. /// @@ -650,45 +667,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } } - /// - /// Compares entire 8x8 block to a single scalar value. - /// - /// Value to compare to. 
- public bool EqualsToScalar(int value) + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector NormalizeAndRound(Vector row, Vector off, Vector max) { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) - { - const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); - - var targetVector = Vector256.Create(value); - ref Vector256 blockStride = ref this.V0; - - for (int i = 0; i < RowCount; i++) - { - Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector); - if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask) - { - return false; - } - } - - return true; - } -#endif - { - ref float scalars = ref Unsafe.As(ref this); - - for (int i = 0; i < Size; i++) - { - if ((int)Unsafe.Add(ref scalars, i) != value) - { - return false; - } - } - - return true; - } + row += off; + row = Vector.Max(row, Vector.Zero); + row = Vector.Min(row, max); + return row.FastRound(); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index eb60445d3..acc83e279 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -14,6 +14,30 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { internal static partial class FastFloatingPointDCT { +#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings + private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); + private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f); + private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); + private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); + + private static readonly Vector128 mm128_F_0_7071 = Vector128.Create(0.707106781f); + private static 
readonly Vector128 mm128_F_0_3826 = Vector128.Create(0.382683433f); + private static readonly Vector128 mm128_F_0_5411 = Vector128.Create(0.541196100f); + private static readonly Vector128 mm128_F_1_3065 = Vector128.Create(1.306562965f); + + private static readonly Vector256 mm256_F_1_1758 = Vector256.Create(1.175876f); + private static readonly Vector256 mm256_F_n1_9615 = Vector256.Create(-1.961570560f); + private static readonly Vector256 mm256_F_n0_3901 = Vector256.Create(-0.390180644f); + private static readonly Vector256 mm256_F_n0_8999 = Vector256.Create(-0.899976223f); + private static readonly Vector256 mm256_F_n2_5629 = Vector256.Create(-2.562915447f); + private static readonly Vector256 mm256_F_0_2986 = Vector256.Create(0.298631336f); + private static readonly Vector256 mm256_F_2_0531 = Vector256.Create(2.053119869f); + private static readonly Vector256 mm256_F_3_0727 = Vector256.Create(3.072711026f); + private static readonly Vector256 mm256_F_1_5013 = Vector256.Create(1.501321110f); + private static readonly Vector256 mm256_F_n1_8477 = Vector256.Create(-1.847759065f); + private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f); +#pragma warning restore SA1310, SA1311, IDE1006 + /// /// Gets reciprocal coefficients for jpeg quantization tables calculation. 
/// @@ -50,18 +74,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, }; -#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings - private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); - private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f); - private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); - private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); - - private static readonly Vector128 mm128_F_0_7071 = Vector128.Create(0.707106781f); - private static readonly Vector128 mm128_F_0_3826 = Vector128.Create(0.382683433f); - private static readonly Vector128 mm128_F_0_5411 = Vector128.Create(0.541196100f); - private static readonly Vector128 mm128_F_1_3065 = Vector128.Create(1.306562965f); -#pragma warning restore SA1310, SA1311, IDE1006 - /// /// Apply floating point FDCT inplace using simd operations. /// @@ -205,6 +217,200 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components block.V1 = Avx.Add(z11, z4); block.V7 = Avx.Subtract(z11, z4); } + + /// + /// Performs 8x8 matrix Inverse Discrete Cosine Transform + /// + /// Source + /// Destination + public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported) + { + IDCT8x8_Avx(ref s, ref d); + } + else +#endif + { + IDCT8x4_LeftPart(ref s, ref d); + IDCT8x4_RightPart(ref s, ref d); + } + } + + /// + /// Do IDCT internal operations on the left part of the block. 
Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// Destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1L; + Vector4 my7 = s.V7L; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3L; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5L; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2L; + Vector4 my6 = s.V6L; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0L; + Vector4 my4 = s.V4L; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0L = my0 + mb0; + d.V7L = my0 - mb0; + d.V1L = my1 + mb1; + d.V6L = my1 - mb1; + d.V2L = my2 + mb2; + d.V5L = my2 - mb2; + d.V3L = my3 + mb3; + d.V4L = my3 - mb3; + } + + /// + /// Do IDCT internal operations on the right part of the block. 
+ /// Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// The destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1R; + Vector4 my7 = s.V7R; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3R; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5R; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2R; + Vector4 my6 = s.V6R; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0R; + Vector4 my4 = s.V4R; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0R = my0 + mb0; + d.V7R = my0 - mb0; + d.V1R = my1 + mb1; + d.V6R = my1 - mb1; + d.V2R = my2 + mb2; + d.V5R = my2 - mb2; + d.V3R = my3 + mb3; + d.V4R = my3 - mb3; + } + + /// + /// Combined operation of and + /// using AVX commands. 
+ /// + /// Source + /// Destination + public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); + + Vector256 my1 = s.V1; + Vector256 my7 = s.V7; + Vector256 mz0 = Avx.Add(my1, my7); + + Vector256 my3 = s.V3; + Vector256 mz2 = Avx.Add(my3, my7); + Vector256 my5 = s.V5; + Vector256 mz1 = Avx.Add(my3, my5); + Vector256 mz3 = Avx.Add(my1, my5); + + Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758); + + mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615); + mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901); + mz0 = Avx.Multiply(mz0, mm256_F_n0_8999); + mz1 = Avx.Multiply(mz1, mm256_F_n2_5629); + + Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2); + Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3); + Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2); + Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3); + + Vector256 my2 = s.V2; + Vector256 my6 = s.V6; + mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411); + Vector256 my0 = s.V0; + Vector256 my4 = s.V4; + mz0 = Avx.Add(my0, my4); + mz1 = Avx.Subtract(my0, my4); + mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477); + mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653); + + my0 = Avx.Add(mz0, mz3); + my3 = Avx.Subtract(mz0, mz3); + my1 = Avx.Add(mz1, mz2); + my2 = Avx.Subtract(mz1, mz2); + + d.V0 = Avx.Add(my0, mb0); + d.V7 = Avx.Subtract(my0, mb0); + d.V1 = Avx.Add(my1, mb1); + d.V6 = Avx.Subtract(my1, mb1); + d.V2 = Avx.Add(my2, mb2); + d.V5 = Avx.Subtract(my2, mb2); + d.V3 = Avx.Add(my3, mb3); + d.V4 = Avx.Subtract(my3, mb3); +#endif + } } } #endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs 
b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index a554e8577..181f18185 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -43,216 +43,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private const float C_0_765367 = 0.765366865f; private const float C_0_125 = 0.1250f; - -#if SUPPORTS_RUNTIME_INTRINSICS - private static readonly Vector256 C_V_0_5411 = Vector256.Create(0.541196f); - private static readonly Vector256 C_V_1_1758 = Vector256.Create(1.175876f); - - private static readonly Vector256 C_V_n1_9615 = Vector256.Create(-1.961570560f); - private static readonly Vector256 C_V_n0_3901 = Vector256.Create(-0.390180644f); - private static readonly Vector256 C_V_n0_8999 = Vector256.Create(-0.899976223f); - private static readonly Vector256 C_V_n2_5629 = Vector256.Create(-2.562915447f); - private static readonly Vector256 C_V_0_2986 = Vector256.Create(0.298631336f); - private static readonly Vector256 C_V_2_0531 = Vector256.Create(2.053119869f); - private static readonly Vector256 C_V_3_0727 = Vector256.Create(3.072711026f); - private static readonly Vector256 C_V_1_5013 = Vector256.Create(1.501321110f); - private static readonly Vector256 C_V_n1_8477 = Vector256.Create(-1.847759065f); - private static readonly Vector256 C_V_0_7653 = Vector256.Create(0.765366865f); -#endif #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore - /// - /// Performs 8x8 matrix Inverse Discrete Cosine Transform - /// - /// Source - /// Destination - public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) - { - IDCT8x8_Avx(ref s, ref d); - } - else -#endif - { - IDCT8x4_LeftPart(ref s, ref d); - IDCT8x4_RightPart(ref s, ref d); - } - } - - /// - /// Do IDCT internal operations on the left part of the block. 
Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// Destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1L; - Vector4 my7 = s.V7L; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3L; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5L; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2L; - Vector4 my6 = s.V6L; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0L; - Vector4 my4 = s.V4L; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0L = my0 + mb0; - d.V7L = my0 - mb0; - d.V1L = my1 + mb1; - d.V6L = my1 - mb1; - d.V2L = my2 + mb2; - d.V5L = my2 - mb2; - d.V3L = my3 + mb3; - d.V4L = my3 - mb3; - } - - /// - /// Do IDCT internal operations on the right part of the block. 
- /// Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// The destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1R; - Vector4 my7 = s.V7R; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3R; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5R; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2R; - Vector4 my6 = s.V6R; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0R; - Vector4 my4 = s.V4R; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0R = my0 + mb0; - d.V7R = my0 - mb0; - d.V1R = my1 + mb1; - d.V6R = my1 - mb1; - d.V2R = my2 + mb2; - d.V5R = my2 - mb2; - d.V3R = my3 + mb3; - d.V4R = my3 - mb3; - } - - /// - /// Combined operation of and - /// using AVX commands. 
- /// - /// Source - /// Destination - public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); - - Vector256 my1 = s.V1; - Vector256 my7 = s.V7; - Vector256 mz0 = Avx.Add(my1, my7); - - Vector256 my3 = s.V3; - Vector256 mz2 = Avx.Add(my3, my7); - Vector256 my5 = s.V5; - Vector256 mz1 = Avx.Add(my3, my5); - Vector256 mz3 = Avx.Add(my1, my5); - - Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758); - - mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615); - mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901); - mz0 = Avx.Multiply(mz0, C_V_n0_8999); - mz1 = Avx.Multiply(mz1, C_V_n2_5629); - - Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2); - Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3); - Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2); - Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3); - - Vector256 my2 = s.V2; - Vector256 my6 = s.V6; - mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411); - Vector256 my0 = s.V0; - Vector256 my4 = s.V4; - mz0 = Avx.Add(my0, my4); - mz1 = Avx.Subtract(my0, my4); - mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477); - mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653); - - my0 = Avx.Add(mz0, mz3); - my3 = Avx.Subtract(mz0, mz3); - my1 = Avx.Add(mz1, mz2); - my2 = Avx.Subtract(mz1, mz2); - - d.V0 = Avx.Add(my0, mb0); - d.V7 = Avx.Subtract(my0, mb0); - d.V1 = Avx.Add(my1, mb1); - d.V6 = Avx.Subtract(my1, mb1); - d.V2 = Avx.Add(my2, mb2); - d.V5 = Avx.Subtract(my2, mb2); - d.V3 = Avx.Add(my3, mb3); - d.V4 = Avx.Subtract(my3, mb3); -#endif - } /// /// Apply floating point IDCT inplace. 
@@ -267,10 +58,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components temp.Transpose(); IDCT8x8(ref temp, ref block); - // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? + // TODO: This can be fused into quantization table step block.MultiplyInPlace(C_0_125); } + /// + /// Apply 2D floating point FDCT inplace. + /// + /// Input matrix. + public static void TransformFDCT(ref Block8x8F block) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported || Sse.IsSupported) + { + ForwardTransformSimd(ref block); + } + else +#endif + { + ForwardTransformScalar(ref block); + } + } + /// /// Apply 2D floating point FDCT inplace using scalar operations. /// @@ -380,23 +189,5 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components dataRef = ref Unsafe.Add(ref dataRef, 1); } } - - /// - /// Apply 2D floating point FDCT inplace. - /// - /// Input matrix. - public static void TransformFDCT(ref Block8x8F block) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported || Sse.IsSupported) - { - ForwardTransformSimd(ref block); - } - else -#endif - { - ForwardTransformScalar(ref block); - } - } } } From 9973e8da3b531f272f6079054e69d80303494ea7 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 10 Sep 2021 09:01:41 +0300 Subject: [PATCH 29/56] Removed excess code, added benchmarks --- .../Formats/Jpeg/Components/Block8x8F.cs | 30 +++++-------------- .../FastFloatingPointDCT.Intrinsic.cs | 3 +- .../Jpeg/Components/FastFloatingPointDCT.cs | 3 -- .../BlockOperations/Block8x8F_Quantize.cs | 23 ++++++++++++++ .../BlockOperations/Block8x8F_Transpose.cs | 16 +++++----- tests/ImageSharp.Benchmarks/Program.cs | 11 +++---- 6 files changed, 44 insertions(+), 42 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index a25c572ae..d93375f39 100644 --- 
a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -424,32 +424,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components else #endif { - Multiply(ref block, ref qt); - block.RoundInto(ref dest); + for (int i = 0; i < Size; i++) + { + int idx = ZigZag.ZigZagOrder[i]; + float quantizedVal = block[idx] * qt[idx]; + quantizedVal += quantizedVal < 0 ? -0.5f : 0.5f; + dest[i] = (short)quantizedVal; + } } } - [MethodImpl(InliningOptions.ShortMethod)] - private static void Multiply(ref Block8x8F a, ref Block8x8F b) - { - a.V0L *= b.V0L; - a.V0R *= b.V0R; - a.V1L *= b.V1L; - a.V1R *= b.V1R; - a.V2L *= b.V2L; - a.V2R *= b.V2R; - a.V3L *= b.V3L; - a.V3R *= b.V3R; - a.V4L *= b.V4L; - a.V4R *= b.V4R; - a.V5L *= b.V5L; - a.V5R *= b.V5R; - a.V6L *= b.V6L; - a.V6R *= b.V6R; - a.V7L *= b.V7L; - a.V7R *= b.V7R; - } - public void RoundInto(ref Block8x8 dest) { for (int i = 0; i < Size; i++) diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index acc83e279..d9a04befb 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -3,12 +3,11 @@ #if SUPPORTS_RUNTIME_INTRINSICS using System; -using System.Collections.Generic; +using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; -using System.Text; namespace SixLabors.ImageSharp.Formats.Jpeg.Components { diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 181f18185..6f68881cd 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -1,11 +1,8 @@ // Copyright (c) Six Labors. 
// Licensed under the Apache License, Version 2.0. -using System.Diagnostics; -using System.Numerics; using System.Runtime.CompilerServices; #if SUPPORTS_RUNTIME_INTRINSICS -using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; #endif diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs new file mode 100644 index 000000000..b826193c3 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs @@ -0,0 +1,23 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Formats.Jpeg.Components; + +namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class Block8x8F_Quantize + { + private Block8x8F block = default; + private Block8x8F quant = default; + private Block8x8 result = default; + + [Benchmark] + public short Quantize() + { + Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant); + return this.result[0]; + } + } +} diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs index 8e8787475..47f7d2fbc 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -9,25 +9,27 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class Block8x8F_Transpose { - private static readonly Block8x8F Source = Create8x8FloatData(); + private Block8x8F source = Create8x8FloatData(); [Benchmark] - public void TransposeInto() => Source.Transpose(); + public float TransposeInto() + { + this.source.Transpose(); + return this.source[0]; 
+ } private static Block8x8F Create8x8FloatData() { - float[] result = new float[64]; + Block8x8F block = default; for (int i = 0; i < 8; i++) { for (int j = 0; j < 8; j++) { - result[(i * 8) + j] = (i * 10) + j; + block[(i * 8) + j] = (i * 10) + j; } } - var source = default(Block8x8F); - source.LoadFrom(result); - return source; + return block; } } } diff --git a/tests/ImageSharp.Benchmarks/Program.cs b/tests/ImageSharp.Benchmarks/Program.cs index 8080825d9..f6ffa6f80 100644 --- a/tests/ImageSharp.Benchmarks/Program.cs +++ b/tests/ImageSharp.Benchmarks/Program.cs @@ -1,8 +1,6 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. -using System.Reflection; - using BenchmarkDotNet.Running; namespace SixLabors.ImageSharp.Benchmarks @@ -15,9 +13,8 @@ namespace SixLabors.ImageSharp.Benchmarks /// /// The arguments to pass to the program. /// - public static void Main(string[] args) - { - new BenchmarkSwitcher(typeof(Program).GetTypeInfo().Assembly).Run(args); - } + public static void Main(string[] args) => BenchmarkSwitcher + .FromAssembly(typeof(Program).Assembly) + .Run(args); } } From d21e374e86cca30c97ffbe7f3d31dedbe9d4dc7f Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 10 Sep 2021 12:27:35 +0300 Subject: [PATCH 30/56] Tidied up the code, added benchmarks --- .../Formats/Jpeg/Components/Block8x8.cs | 2 + .../Jpeg/Components/Block8x8F.Intrinsic.cs | 29 +- .../FastFloatingPointDCT.Intrinsic.cs | 172 ----------- .../Jpeg/Components/FastFloatingPointDCT.cs | 173 +++++++++++ .../Jpeg/Components/ZigZag.Intrinsic.cs | 290 +++++++----------- .../BlockOperations/Block8x8F_Quantize.cs | 31 +- .../BlockOperations/Block8x8F_Transpose.cs | 14 + .../Config.HwIntrinsics.cs | 10 +- 8 files changed, 348 insertions(+), 373 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index c76eb942f..71077675d 100644 --- 
a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -5,8 +5,10 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +#endif using System.Text; namespace SixLabors.ImageSharp.Formats.Jpeg.Components diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index 83227ff07..733d32892 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -35,33 +35,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components [FieldOffset(224)] public Vector256 V7; - private static ReadOnlySpan DivideIntoInt16_Avx2_ShuffleMask => new int[] { - 0, 1, 4, 5, 2, 3, 6, 7 - }; + private static readonly Vector256 MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); - fixed (int* maskPtr = DivideIntoInt16_Avx2_ShuffleMask) - { - Vector256 crossLaneShuffleMask = Avx.LoadVector256(maskPtr).AsInt32(); - - ref Vector256 aBase = ref Unsafe.As>(ref a); - ref Vector256 bBase = ref Unsafe.As>(ref b); + ref Vector256 aBase = ref a.V0; + ref Vector256 bBase = ref b.V0; - ref Vector256 destBase = ref Unsafe.As>(ref dest); + ref Vector256 destRef = ref dest.V01; - for (int i = 0; i < 8; i += 2) - { - Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + for (int i = 0; i < 8; i += 2) + { + Vector256 row0 = 
Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); - Vector256 row = Avx2.PackSignedSaturate(row0, row1); - row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16(); + Vector256 row = Avx2.PackSignedSaturate(row0, row1); + row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16(); - Unsafe.Add(ref destBase, i / 2) = row; - } + Unsafe.Add(ref destRef, i / 2) = row; } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index d9a04befb..7a2b0a78c 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -2,7 +2,6 @@ // Licensed under the Apache License, Version 2.0. #if SUPPORTS_RUNTIME_INTRINSICS -using System; using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; @@ -37,42 +36,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f); #pragma warning restore SA1310, SA1311, IDE1006 - /// - /// Gets reciprocal coefficients for jpeg quantization tables calculation. - /// - /// - /// - /// Current FDCT implementation expects its results to be multiplied by - /// a reciprocal quantization table. Values in this table must be divided - /// by quantization table values scaled with quality settings. 
- /// - /// - /// These values were calculates with this formula: - /// - /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8; - /// - /// Where: - /// - /// scalefactor[0] = 1 - /// - /// - /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 - /// - /// Values are also scaled by 8 so DCT code won't do unnecessary division. - /// - /// - public static ReadOnlySpan DctReciprocalAdjustmentCoefficients => new float[] - { - 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, - 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, - 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, - 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, - 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, - 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, - 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, - 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, - }; - /// /// Apply floating point FDCT inplace using simd operations. /// @@ -217,141 +180,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components block.V7 = Avx.Subtract(z11, z4); } - /// - /// Performs 8x8 matrix Inverse Discrete Cosine Transform - /// - /// Source - /// Destination - public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) - { - IDCT8x8_Avx(ref s, ref d); - } - else -#endif - { - IDCT8x4_LeftPart(ref s, ref d); - IDCT8x4_RightPart(ref s, ref d); - } - } - - /// - /// Do IDCT internal operations on the left part of the block. 
Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// Destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1L; - Vector4 my7 = s.V7L; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3L; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5L; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2L; - Vector4 my6 = s.V6L; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0L; - Vector4 my4 = s.V4L; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0L = my0 + mb0; - d.V7L = my0 - mb0; - d.V1L = my1 + mb1; - d.V6L = my1 - mb1; - d.V2L = my2 + mb2; - d.V5L = my2 - mb2; - d.V3L = my3 + mb3; - d.V4L = my3 - mb3; - } - - /// - /// Do IDCT internal operations on the right part of the block. 
- /// Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// The destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1R; - Vector4 my7 = s.V7R; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3R; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5R; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2R; - Vector4 my6 = s.V6R; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0R; - Vector4 my4 = s.V4R; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0R = my0 + mb0; - d.V7R = my0 - mb0; - d.V1R = my1 + mb1; - d.V6R = my1 - mb1; - d.V2R = my2 + mb2; - d.V5R = my2 - mb2; - d.V3R = my3 + mb3; - d.V4R = my3 - mb3; - } - /// /// Combined operation of and /// using AVX commands. diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 6f68881cd..91b92d8cf 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -1,6 +1,8 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. 
+using System; +using System.Numerics; using System.Runtime.CompilerServices; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics.X86; @@ -42,6 +44,42 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private const float C_0_125 = 0.1250f; #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore + /// + /// Gets reciprocal coefficients for jpeg quantization tables calculation. + /// + /// + /// + /// Current FDCT implementation expects its results to be multiplied by + /// a reciprocal quantization table. Values in this table must be divided + /// by quantization table values scaled with quality settings. + /// + /// + /// These values were calculates with this formula: + /// + /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8; + /// + /// Where: + /// + /// scalefactor[0] = 1 + /// + /// + /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + /// + /// Values are also scaled by 8 so DCT code won't do unnecessary division. + /// + /// + public static ReadOnlySpan DctReciprocalAdjustmentCoefficients => new float[] + { + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, + 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, + 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, + 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, + 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, + }; + /// /// Apply floating point IDCT inplace. 
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. @@ -186,5 +224,140 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components dataRef = ref Unsafe.Add(ref dataRef, 1); } } + + /// + /// Performs 8x8 matrix Inverse Discrete Cosine Transform + /// + /// Source + /// Destination + public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported) + { + IDCT8x8_Avx(ref s, ref d); + } + else +#endif + { + IDCT8x4_LeftPart(ref s, ref d); + IDCT8x4_RightPart(ref s, ref d); + } + } + + /// + /// Do IDCT internal operations on the left part of the block. Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// Destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1L; + Vector4 my7 = s.V7L; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3L; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5L; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2L; + Vector4 my6 = s.V6L; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0L; + Vector4 my4 = s.V4L; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0L = my0 + mb0; + d.V7L = my0 - mb0; + d.V1L = my1 + mb1; + d.V6L = my1 - mb1; + d.V2L = my2 + mb2; + d.V5L = my2 - mb2; + d.V3L = my3 + mb3; + d.V4L = my3 - mb3; + } + + 
/// + /// Do IDCT internal operations on the right part of the block. + /// Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// The destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1R; + Vector4 my7 = s.V7R; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3R; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5R; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2R; + Vector4 my6 = s.V6R; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0R; + Vector4 my4 = s.V4R; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0R = my0 + mb0; + d.V7R = my0 - mb0; + d.V1R = my1 + mb1; + d.V6R = my1 - mb1; + d.V2R = my2 + mb2; + d.V5R = my2 - mb2; + d.V3R = my3 + mb3; + d.V4R = my3 - mb3; + } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index 878a67b50..abe02d040 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -23,82 +23,65 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// private static ReadOnlySpan SseShuffleMasks => new byte[] { - // 0_A + // row0 + // A B C 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _, - // 0_B _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5, - // 
0_C _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, - // 1_A + // row1 + // A B C D E _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11, - // 1_B _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _, - // 1_C 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _, - // 1_D _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _, - // 1_E _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, - // 2_B + // row2 + // B C D E F G 8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _, - // 2_C _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, - // 2_D _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _, - // 2_E _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, - // 2_F _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, - // 2_G _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, - // 3_A + // row3 + // A B C D + // D shuffle mask is the for row4 E row shuffle mask _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _, - // 3_B _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _, - // 3_C _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, - // 3_D/4_E 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, - // 4_F + // row4 + // E F G H + // 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _, - // 4_G _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, - // 4_H _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _, - // 5_B + // row5 + // B C D E F G _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _, - // 5_C _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, - // 5_D 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _, - // 5_E _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, - // 5_F _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, - // 5_G _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, - // 6_D + // row6 + // D E F G H _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, - // 6_E _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _, - // 6_F _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13, - // 6_G _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _, - // 6_H 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, 
_, - // 7_F + // row7 + // F G H _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _, - // 7_G 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, - // 7_H _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15 }; @@ -177,95 +160,95 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components fixed (byte* maskPtr = SseShuffleMasks) { - Vector128 A = source.V0.AsByte(); - Vector128 B = source.V1.AsByte(); - Vector128 C = source.V2.AsByte(); - Vector128 D = source.V3.AsByte(); - Vector128 E = source.V4.AsByte(); - Vector128 F = source.V5.AsByte(); - Vector128 G = source.V6.AsByte(); - Vector128 H = source.V7.AsByte(); + Vector128 rowA = source.V0.AsByte(); + Vector128 rowB = source.V1.AsByte(); + Vector128 rowC = source.V2.AsByte(); + Vector128 rowD = source.V3.AsByte(); + Vector128 rowE = source.V4.AsByte(); + Vector128 rowF = source.V5.AsByte(); + Vector128 rowG = source.V6.AsByte(); + Vector128 rowH = source.V7.AsByte(); // row0 - Vector128 row0_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16(); - Vector128 row0_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16(); - Vector128 row0 = Sse2.Or(row0_A, row0_B); - Vector128 row0_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16(); - row0 = Sse2.Or(row0, row0_C); + Vector128 row0A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16(); + Vector128 row0B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16(); + Vector128 row0 = Sse2.Or(row0A, row0B); + Vector128 row0C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16(); + row0 = Sse2.Or(row0, row0C); // row1 - Vector128 row1_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16(); - Vector128 row1_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16(); - Vector128 row1 = Sse2.Or(row1_A, row1_B); - Vector128 row1_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1_C); - Vector128 row1_D = 
Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1_D); - Vector128 row1_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1_E); + Vector128 row1A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16(); + Vector128 row1B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16(); + Vector128 row1 = Sse2.Or(row1A, row1B); + Vector128 row1C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1C); + Vector128 row1D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1D); + Vector128 row1E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1E); // row2 - Vector128 row2_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16(); - Vector128 row2_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16(); - Vector128 row2 = Sse2.Or(row2_B, row2_C); - Vector128 row2_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_D); - Vector128 row2_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_E); - Vector128 row2_F = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_F); - Vector128 row2_G = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_G); + Vector128 row2B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16(); + Vector128 row2C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16(); + Vector128 row2 = Sse2.Or(row2B, row2C); + Vector128 row2D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2D); + Vector128 row2E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2E); 
+ Vector128 row2F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2F); + Vector128 row2G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2G); // row3 - Vector128 A_3 = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16(); - Vector128 B_3 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16(); - Vector128 row3 = Sse2.Or(A_3, B_3); - Vector128 C_3 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16(); - row3 = Sse2.Or(row3, C_3); - Vector128 D3_E4_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16)); - Vector128 D_3 = Ssse3.Shuffle(D, D3_E4_shuffleMask).AsInt16(); - row3 = Sse2.Or(row3, D_3); + Vector128 row3A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16(); + Vector128 row3B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16(); + Vector128 row3 = Sse2.Or(row3A, row3B); + Vector128 row3C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16(); + row3 = Sse2.Or(row3, row3C); + Vector128 row3D_row4E_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16)); + Vector128 row3D = Ssse3.Shuffle(rowD, row3D_row4E_shuffleMask).AsInt16(); + row3 = Sse2.Or(row3, row3D); // row4 - Vector128 E_4 = Ssse3.Shuffle(E, D3_E4_shuffleMask).AsInt16(); - Vector128 F_4 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16(); - Vector128 row4 = Sse2.Or(E_4, F_4); - Vector128 G_4 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16(); - row4 = Sse2.Or(row4, G_4); - Vector128 H_4 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16(); - row4 = Sse2.Or(row4, H_4); + Vector128 row4E = Ssse3.Shuffle(rowE, row3D_row4E_shuffleMask).AsInt16(); + Vector128 row4F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16(); + Vector128 row4 = Sse2.Or(row4E, row4F); + Vector128 row4G = 
Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16(); + row4 = Sse2.Or(row4, row4G); + Vector128 row4H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16(); + row4 = Sse2.Or(row4, row4H); // row5 - Vector128 B_5 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16(); - Vector128 C_5 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16(); - Vector128 row5 = Sse2.Or(B_5, C_5); - Vector128 D_5 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16(); - row5 = Sse2.Or(row5, D_5); - Vector128 E_5 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16(); - row5 = Sse2.Or(row5, E_5); - Vector128 F_5 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16(); - row5 = Sse2.Or(row5, F_5); - Vector128 G_5 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16(); - row5 = Sse2.Or(row5, G_5); + Vector128 row5B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16(); + Vector128 row5C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16(); + Vector128 row5 = Sse2.Or(row5B, row5C); + Vector128 row5D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5D); + Vector128 row5E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5E); + Vector128 row5F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5F); + Vector128 row5G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5G); // row6 - Vector128 D_6 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16(); - Vector128 E_6 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16(); - Vector128 row6 = Sse2.Or(D_6, E_6); - Vector128 F_6 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16(); - row6 = Sse2.Or(row6, F_6); - Vector128 G_6 = Ssse3.Shuffle(G, 
Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16(); - row6 = Sse2.Or(row6, G_6); - Vector128 H_6 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16(); - row6 = Sse2.Or(row6, H_6); + Vector128 row6D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16(); + Vector128 row6E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16(); + Vector128 row6 = Sse2.Or(row6D, row6E); + Vector128 row6F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16(); + row6 = Sse2.Or(row6, row6F); + Vector128 row6G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16(); + row6 = Sse2.Or(row6, row6G); + Vector128 row6H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16(); + row6 = Sse2.Or(row6, row6H); // row7 - Vector128 F_7 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16(); - Vector128 G_7 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16(); - Vector128 row7 = Sse2.Or(F_7, G_7); - Vector128 H_7 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16(); - row7 = Sse2.Or(row7, H_7); + Vector128 row7F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16(); + Vector128 row7G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16(); + Vector128 row7 = Sse2.Or(row7F, row7G); + Vector128 row7H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16(); + row7 = Sse2.Or(row7, row7H); dest.V0 = row0; dest.V1 = row1; @@ -292,105 +275,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components fixed (byte* shuffleVectorsPtr = AvxShuffleMasks) { - // 18 loads - // 10 cross-lane shuffles (permutations) - // 14 shuffles - // 10 bitwise or's - // 4 stores - - // A0 A1 A2 A3 A4 A5 A6 A7 | B0 B1 B2 B3 B4 B5 B6 B7 - // C0 C1 C2 C3 C4 C5 C6 C7 | D0 D1 D2 D3 D4 D5 D6 D7 - // E0 E1 E2 E3 E4 E5 E6 E7 | F0 F1 F2 F3 F4 F5 F6 F7 - // G0 G1 G2 G3 G4 G5 G6 G7 | H0 H1 H2 H3 H4 H5 H6 H7 - Vector256 AB = 
source.V01.AsByte(); - Vector256 CD = source.V23.AsByte(); - Vector256 EF = source.V45.AsByte(); - Vector256 GH = source.V67.AsByte(); - - // row01 - A0 A1 B0 C0 B1 A2 A3 B2 | C1 D0 E0 D1 C2 B3 A4 A5 - Vector256 AB01_EF01_CD23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32(); - - // row01_AB - (A0 A1) (B0 B1) (A2 A3) (B2 B3) | (B2 B3) (A4 A5) (X X) (X X) - Vector256 row01_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); - // row01_AB - (A0 A1) (B0 X) (B1 A2) (A3 B2) | (X X) (X X) (X B3) (A4 A5) + Vector256 rowsAB = source.V01.AsByte(); + Vector256 rowsCD = source.V23.AsByte(); + Vector256 rowsEF = source.V45.AsByte(); + Vector256 rowsGH = source.V67.AsByte(); + + // rows 0 1 + Vector256 rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32(); + Vector256 row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte(); row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte(); - Vector256 CD01_GH23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32(); - - // row01_CD - (C0 C1) (X X) (X X) (X X) | (C0 C1) (D0 D1) (C2 C3) (X X) - Vector256 row01_CD = Avx2.PermuteVar8x32(CD.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte(); - // row01_CD - (X X) (X C0) (X X) (X X) | (C1 D0) (X D1) (C2 X) (X X) + Vector256 rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32(); + Vector256 row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte(); row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte(); - // row01_EF - (E0 E1) (E2 E3) (F0 F1) (X X) | (E0 E1) (X X) (X X) (X X) - Vector256 row0123_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); - // row01_EF - (X X) (X X) (X X) (X X) | (X X) (E0 X) (X X) (X X) + Vector256 row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), 
rows_AB01_EF01_CD23_shuffleMask).AsByte(); Vector256 row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte(); Vector256 row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF); - - // row23 - B4 C3 D2 E1 F0 G0 F1 E2 | D3 C4 B5 A6 A7 B6 C5 D4 - - Vector256 AB23_CD45_EF67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32(); - - // row23_AB - (B4 B5) (X X) (X X) (X X) | (B4 B5) (B6 B7) (A6 A7) (X X) - Vector256 row2345_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); - // row23_AB - (B4 X) (X X) (X X) (X X) | (X X) (B5 A6) (A7 B6) (X X) + // rows 2 3 + Vector256 rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32(); + Vector256 row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); Vector256 row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte(); - // row23_CD - (C2 C3) (D2 D3) (X X) (X X) | (D2 D3) (C4 C5) (D4 D5) (X X) - Vector256 row23_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); - // row23_CD - (X C3) (D2 X) (X X) (X X) | (D3 C4) (X X) (X X) (C5 D4) + Vector256 row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte(); row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte(); - // row23_EF - (X X) (X E1) (F0 X) (F1 E2) | (X X) (X X) (X X) (X X) Vector256 row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte(); - // row23_GH - (G0 G1) (G2 G3) (H0 H1) (X X) | (G2 G3) (X X) (X X) (X X) - Vector256 row2345_GH = Avx2.PermuteVar8x32(GH.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte(); - // row23_GH - (X X) (X X) (X G0) (X X) | (X X) (X X) (X X) (X X) + Vector256 row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte(); Vector256 row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 
* 32)).AsByte()); Vector256 row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH)); - - // row45 - E3 F2 G1 H0 H1 G2 F3 E4 | D5 C6 B7 C7 D6 E5 F4 G3 - - // row45_AB - (X X) (X X) (X X) (X X) | (X X) (B7 X) (X X) (X X) + // rows 4 5 Vector256 row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte()); - - // row45_CD - (D6 D7) (X X) (X X) (X X) | (C6 C7) (D4 D5) (D6 D7) (X X) - Vector256 row4567_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); - // row45_CD - (X X) (X X) (X X) (X X) | (D5 C6) (X C7) (D6 X) (X X) + Vector256 row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); Vector256 row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte()); - Vector256 EF45_GH67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32(); - - // row45_EF - (E2 E3) (E4 E5) (F2 F3) (X X) | (E4 E5) (F4 F5) (X X) (X X) - Vector256 row45_EF = Avx2.PermuteVar8x32(EF.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte(); - // row45_EF - (E3 F2) (X X) (X X) (F3 E4) | (X X) (X X) (X E5) (F4 X) + Vector256 rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32(); + Vector256 row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte(); row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte()); - // row45_GH - (X X) (G1 H0) (H1 G2) (X X) | (X X) (X X) (X X) (X G3) Vector256 row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte()); Vector256 row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH)); - - // row67 - H2 H3 G4 F5 E6 D7 E7 F6 | G5 H4 H5 G6 F7 G7 H6 H7 - - // row67_CD - (X X) (X X) (X D7) (X X) | (X X) (X X) (X X) (X X) + // rows 6 7 Vector256 row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte()); - // row67_EF - (E6 E7) (F4 F5) (F6 
F7) (X X) | (F6 F7) (X X) (X X) (X X) - Vector256 row67_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); - // row67_EF - (X X) (X F5) (E6 X) (E7 F6) | (X X) (X X) (F7 X) (X X) + Vector256 row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte()); - // row67_GH - (G4 G5) (H2 H3) (X X) (X X) | (G4 G5) (G6 G7) (H4 H5) (H6 H7) - Vector256 row67_GH = Avx2.PermuteVar8x32(GH.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte(); - // row67_GH - (H2 H3) (G4 X) (X X) (X X) | (G5 H4) (H5 G6) (X G7) (H6 H7) + Vector256 row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte(); row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte()); Vector256 row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH); diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs index b826193c3..898bbdb45 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs @@ -9,8 +9,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class Block8x8F_Quantize { - private Block8x8F block = default; - private Block8x8F quant = default; + private Block8x8F block = CreateFromScalar(1); + private Block8x8F quant = CreateFromScalar(1); private Block8x8 result = default; [Benchmark] @@ -19,5 +19,32 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant); return this.result[0]; } + + private static Block8x8F CreateFromScalar(float scalar) + { + Block8x8F block = default; + for (int i = 0; i < 64; i++) + { + block[i] = scalar; 
+ } + + return block; + } } } + +/* +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update) +Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 1. No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 2. SSE : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 3. AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + +| Method | Job | Mean | Error | StdDev | Ratio | +|--------- |-----------------|---------:|---------:|---------:|------:| +| Quantize | No HwIntrinsics | 73.34 ns | 1.081 ns | 1.011 ns | 1.00 | +| Quantize | SSE | 24.11 ns | 0.298 ns | 0.279 ns | 0.33 | +| Quantize | AVX | 15.90 ns | 0.074 ns | 0.065 ns | 0.22 | + */ diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs index 47f7d2fbc..28899b51e 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -33,3 +33,17 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations } } } + +/* +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update) +Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + +| Method | Job | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated | +|-------------- |---------------- 
|----------:|----------:|----------:|------:|------:|------:|------:|----------:| +| TransposeInto | No HwIntrinsics | 19.658 ns | 0.0550 ns | 0.0515 ns | 1.00 | - | - | - | - | +| TransposeInto | AVX | 8.613 ns | 0.0249 ns | 0.0208 ns | 0.44 | - | - | - | - | +*/ diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index 5ceb4c8a0..ffe0f4c02 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -65,17 +65,17 @@ namespace SixLabors.ImageSharp.Benchmarks .WithId("1. No HwIntrinsics").AsBaseline()); #if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) + if (Sse.IsSupported) { this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithId("2. AVX")); + .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) + .WithId("2. SSE")); } - if (Sse.IsSupported) + if (Avx.IsSupported) { this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) - .WithId("3. SSE")); + .WithId("3. 
AVX")); } #endif } From f297fce021ef03e988d7c61c5641e78bcdb895bd Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 10 Sep 2021 12:27:35 +0300 Subject: [PATCH 31/56] Tidied up the code, added benchmarks --- .../Formats/Jpeg/Components/Block8x8.cs | 2 + .../Jpeg/Components/Block8x8F.Intrinsic.cs | 29 +- .../Components/Encoder/HuffmanScanEncoder.cs | 8 +- .../FastFloatingPointDCT.Intrinsic.cs | 172 ----------- .../Jpeg/Components/FastFloatingPointDCT.cs | 173 +++++++++++ .../Jpeg/Components/ZigZag.Intrinsic.cs | 290 +++++++----------- .../BlockOperations/Block8x8F_Quantize.cs | 31 +- .../BlockOperations/Block8x8F_Transpose.cs | 14 + .../Config.HwIntrinsics.cs | 10 +- 9 files changed, 352 insertions(+), 377 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index c76eb942f..71077675d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -5,8 +5,10 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +#endif using System.Text; namespace SixLabors.ImageSharp.Formats.Jpeg.Components diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index 83227ff07..733d32892 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -35,33 +35,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components [FieldOffset(224)] public Vector256 V7; - private static ReadOnlySpan DivideIntoInt16_Avx2_ShuffleMask => new int[] { - 0, 1, 4, 5, 2, 3, 6, 7 - }; + private static readonly Vector256 MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); private static unsafe void 
MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); - fixed (int* maskPtr = DivideIntoInt16_Avx2_ShuffleMask) - { - Vector256 crossLaneShuffleMask = Avx.LoadVector256(maskPtr).AsInt32(); - - ref Vector256 aBase = ref Unsafe.As>(ref a); - ref Vector256 bBase = ref Unsafe.As>(ref b); + ref Vector256 aBase = ref a.V0; + ref Vector256 bBase = ref b.V0; - ref Vector256 destBase = ref Unsafe.As>(ref dest); + ref Vector256 destRef = ref dest.V01; - for (int i = 0; i < 8; i += 2) - { - Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + for (int i = 0; i < 8; i += 2) + { + Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); - Vector256 row = Avx2.PackSignedSaturate(row0, row1); - row = Avx2.PermuteVar8x32(row.AsInt32(), crossLaneShuffleMask).AsInt16(); + Vector256 row = Avx2.PackSignedSaturate(row0, row1); + row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16(); - Unsafe.Add(ref destBase, i / 2) = row; - } + Unsafe.Add(ref destRef, i / 2) = row; } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index da4723e21..75f384848 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder where TPixel : unmanaged, IPixel { // Calculate reciprocal quantization tables for FDCT method - for 
(int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i]; @@ -197,7 +197,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder where TPixel : unmanaged, IPixel { // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i]; @@ -270,7 +270,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder where TPixel : unmanaged, IPixel { // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; } @@ -321,7 +321,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder where TPixel : unmanaged, IPixel { // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index d9a04befb..7a2b0a78c 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -2,7 +2,6 @@ // Licensed under the Apache License, Version 2.0. 
#if SUPPORTS_RUNTIME_INTRINSICS -using System; using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; @@ -37,42 +36,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f); #pragma warning restore SA1310, SA1311, IDE1006 - /// - /// Gets reciprocal coefficients for jpeg quantization tables calculation. - /// - /// - /// - /// Current FDCT implementation expects its results to be multiplied by - /// a reciprocal quantization table. Values in this table must be divided - /// by quantization table values scaled with quality settings. - /// - /// - /// These values were calculates with this formula: - /// - /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8; - /// - /// Where: - /// - /// scalefactor[0] = 1 - /// - /// - /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 - /// - /// Values are also scaled by 8 so DCT code won't do unnecessary division. - /// - /// - public static ReadOnlySpan DctReciprocalAdjustmentCoefficients => new float[] - { - 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, - 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, - 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, - 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, - 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, - 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, - 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, - 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, - }; - /// /// Apply floating point FDCT inplace using simd operations. 
/// @@ -217,141 +180,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components block.V7 = Avx.Subtract(z11, z4); } - /// - /// Performs 8x8 matrix Inverse Discrete Cosine Transform - /// - /// Source - /// Destination - public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) - { - IDCT8x8_Avx(ref s, ref d); - } - else -#endif - { - IDCT8x4_LeftPart(ref s, ref d); - IDCT8x4_RightPart(ref s, ref d); - } - } - - /// - /// Do IDCT internal operations on the left part of the block. Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// Destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1L; - Vector4 my7 = s.V7L; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3L; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5L; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2L; - Vector4 my6 = s.V6L; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0L; - Vector4 my4 = s.V4L; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0L = my0 + mb0; - d.V7L = my0 - mb0; - d.V1L = my1 + mb1; - d.V6L = my1 - mb1; - d.V2L = my2 + mb2; - d.V5L = my2 - mb2; - d.V3L = my3 + mb3; - d.V4L = my3 - mb3; - } - - /// - /// Do IDCT internal operations on the right part of the block. 
- /// Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// The destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1R; - Vector4 my7 = s.V7R; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3R; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5R; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2R; - Vector4 my6 = s.V6R; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0R; - Vector4 my4 = s.V4R; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0R = my0 + mb0; - d.V7R = my0 - mb0; - d.V1R = my1 + mb1; - d.V6R = my1 - mb1; - d.V2R = my2 + mb2; - d.V5R = my2 - mb2; - d.V3R = my3 + mb3; - d.V4R = my3 - mb3; - } - /// /// Combined operation of and /// using AVX commands. diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 6f68881cd..1c5cfc8d6 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -1,6 +1,8 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. 
+using System; +using System.Numerics; using System.Runtime.CompilerServices; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics.X86; @@ -42,6 +44,42 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private const float C_0_125 = 0.1250f; #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore + /// + /// Gets reciprocal coefficients for jpeg quantization tables calculation. + /// + /// + /// + /// Current FDCT implementation expects its results to be multiplied by + /// a reciprocal quantization table. To get 8x8 reciprocal block values in this + /// table must be divided by quantization table values scaled with quality settings. + /// + /// + /// These values were calculates with this formula: + /// + /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8; + /// + /// Where: + /// + /// scalefactor[0] = 1 + /// + /// + /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + /// + /// Values are also scaled by 8 so DCT code won't do unnecessary division. + /// + /// + public static ReadOnlySpan DctReciprocalAdjustmentCoefficients => new float[] + { + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, + 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, + 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f, + 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, + 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, + }; + /// /// Apply floating point IDCT inplace. 
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. @@ -186,5 +224,140 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components dataRef = ref Unsafe.Add(ref dataRef, 1); } } + + /// + /// Performs 8x8 matrix Inverse Discrete Cosine Transform + /// + /// Source + /// Destination + public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported) + { + IDCT8x8_Avx(ref s, ref d); + } + else +#endif + { + IDCT8x4_LeftPart(ref s, ref d); + IDCT8x4_RightPart(ref s, ref d); + } + } + + /// + /// Do IDCT internal operations on the left part of the block. Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// Destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1L; + Vector4 my7 = s.V7L; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3L; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5L; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2L; + Vector4 my6 = s.V6L; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0L; + Vector4 my4 = s.V4L; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0L = my0 + mb0; + d.V7L = my0 - mb0; + d.V1L = my1 + mb1; + d.V6L = my1 - mb1; + d.V2L = my2 + mb2; + d.V5L = my2 - mb2; + d.V3L = my3 + mb3; + d.V4L = my3 - mb3; + } + + 
/// + /// Do IDCT internal operations on the right part of the block. + /// Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// The destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1R; + Vector4 my7 = s.V7R; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3R; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5R; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2R; + Vector4 my6 = s.V6R; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0R; + Vector4 my4 = s.V4R; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0R = my0 + mb0; + d.V7R = my0 - mb0; + d.V1R = my1 + mb1; + d.V6R = my1 - mb1; + d.V2R = my2 + mb2; + d.V5R = my2 - mb2; + d.V3R = my3 + mb3; + d.V4R = my3 - mb3; + } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index 878a67b50..abe02d040 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -23,82 +23,65 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// private static ReadOnlySpan SseShuffleMasks => new byte[] { - // 0_A + // row0 + // A B C 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _, - // 0_B _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5, - // 
0_C _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, - // 1_A + // row1 + // A B C D E _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11, - // 1_B _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _, - // 1_C 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _, - // 1_D _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _, - // 1_E _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, - // 2_B + // row2 + // B C D E F G 8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _, - // 2_C _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, - // 2_D _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _, - // 2_E _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, - // 2_F _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, - // 2_G _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, - // 3_A + // row3 + // A B C D + // D shuffle mask is the for row4 E row shuffle mask _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _, - // 3_B _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _, - // 3_C _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, - // 3_D/4_E 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, - // 4_F + // row4 + // E F G H + // 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _, - // 4_G _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, - // 4_H _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _, - // 5_B + // row5 + // B C D E F G _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _, - // 5_C _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, - // 5_D 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _, - // 5_E _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, - // 5_F _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, - // 5_G _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, - // 6_D + // row6 + // D E F G H _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, - // 6_E _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _, - // 6_F _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13, - // 6_G _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _, - // 6_H 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, 
_, - // 7_F + // row7 + // F G H _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _, - // 7_G 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, - // 7_H _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15 }; @@ -177,95 +160,95 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components fixed (byte* maskPtr = SseShuffleMasks) { - Vector128 A = source.V0.AsByte(); - Vector128 B = source.V1.AsByte(); - Vector128 C = source.V2.AsByte(); - Vector128 D = source.V3.AsByte(); - Vector128 E = source.V4.AsByte(); - Vector128 F = source.V5.AsByte(); - Vector128 G = source.V6.AsByte(); - Vector128 H = source.V7.AsByte(); + Vector128 rowA = source.V0.AsByte(); + Vector128 rowB = source.V1.AsByte(); + Vector128 rowC = source.V2.AsByte(); + Vector128 rowD = source.V3.AsByte(); + Vector128 rowE = source.V4.AsByte(); + Vector128 rowF = source.V5.AsByte(); + Vector128 rowG = source.V6.AsByte(); + Vector128 rowH = source.V7.AsByte(); // row0 - Vector128 row0_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16(); - Vector128 row0_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16(); - Vector128 row0 = Sse2.Or(row0_A, row0_B); - Vector128 row0_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16(); - row0 = Sse2.Or(row0, row0_C); + Vector128 row0A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16(); + Vector128 row0B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16(); + Vector128 row0 = Sse2.Or(row0A, row0B); + Vector128 row0C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16(); + row0 = Sse2.Or(row0, row0C); // row1 - Vector128 row1_A = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16(); - Vector128 row1_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16(); - Vector128 row1 = Sse2.Or(row1_A, row1_B); - Vector128 row1_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1_C); - Vector128 row1_D = 
Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1_D); - Vector128 row1_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1_E); + Vector128 row1A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16(); + Vector128 row1B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16(); + Vector128 row1 = Sse2.Or(row1A, row1B); + Vector128 row1C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (5 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1C); + Vector128 row1D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1D); + Vector128 row1E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16(); + row1 = Sse2.Or(row1, row1E); // row2 - Vector128 row2_B = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16(); - Vector128 row2_C = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16(); - Vector128 row2 = Sse2.Or(row2_B, row2_C); - Vector128 row2_D = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_D); - Vector128 row2_E = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_E); - Vector128 row2_F = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_F); - Vector128 row2_G = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2_G); + Vector128 row2B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16(); + Vector128 row2C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16(); + Vector128 row2 = Sse2.Or(row2B, row2C); + Vector128 row2D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2D); + Vector128 row2E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2E); 
+ Vector128 row2F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2F); + Vector128 row2G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16(); + row2 = Sse2.Or(row2, row2G); // row3 - Vector128 A_3 = Ssse3.Shuffle(A, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16(); - Vector128 B_3 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16(); - Vector128 row3 = Sse2.Or(A_3, B_3); - Vector128 C_3 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16(); - row3 = Sse2.Or(row3, C_3); - Vector128 D3_E4_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16)); - Vector128 D_3 = Ssse3.Shuffle(D, D3_E4_shuffleMask).AsInt16(); - row3 = Sse2.Or(row3, D_3); + Vector128 row3A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16(); + Vector128 row3B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16(); + Vector128 row3 = Sse2.Or(row3A, row3B); + Vector128 row3C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16(); + row3 = Sse2.Or(row3, row3C); + Vector128 row3D_row4E_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16)); + Vector128 row3D = Ssse3.Shuffle(rowD, row3D_row4E_shuffleMask).AsInt16(); + row3 = Sse2.Or(row3, row3D); // row4 - Vector128 E_4 = Ssse3.Shuffle(E, D3_E4_shuffleMask).AsInt16(); - Vector128 F_4 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16(); - Vector128 row4 = Sse2.Or(E_4, F_4); - Vector128 G_4 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16(); - row4 = Sse2.Or(row4, G_4); - Vector128 H_4 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16(); - row4 = Sse2.Or(row4, H_4); + Vector128 row4E = Ssse3.Shuffle(rowE, row3D_row4E_shuffleMask).AsInt16(); + Vector128 row4F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16(); + Vector128 row4 = Sse2.Or(row4E, row4F); + Vector128 row4G = 
Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16(); + row4 = Sse2.Or(row4, row4G); + Vector128 row4H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16(); + row4 = Sse2.Or(row4, row4H); // row5 - Vector128 B_5 = Ssse3.Shuffle(B, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16(); - Vector128 C_5 = Ssse3.Shuffle(C, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16(); - Vector128 row5 = Sse2.Or(B_5, C_5); - Vector128 D_5 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16(); - row5 = Sse2.Or(row5, D_5); - Vector128 E_5 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16(); - row5 = Sse2.Or(row5, E_5); - Vector128 F_5 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16(); - row5 = Sse2.Or(row5, F_5); - Vector128 G_5 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16(); - row5 = Sse2.Or(row5, G_5); + Vector128 row5B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16(); + Vector128 row5C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16(); + Vector128 row5 = Sse2.Or(row5B, row5C); + Vector128 row5D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5D); + Vector128 row5E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5E); + Vector128 row5F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5F); + Vector128 row5G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16(); + row5 = Sse2.Or(row5, row5G); // row6 - Vector128 D_6 = Ssse3.Shuffle(D, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16(); - Vector128 E_6 = Ssse3.Shuffle(E, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16(); - Vector128 row6 = Sse2.Or(D_6, E_6); - Vector128 F_6 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16(); - row6 = Sse2.Or(row6, F_6); - Vector128 G_6 = Ssse3.Shuffle(G, 
Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16(); - row6 = Sse2.Or(row6, G_6); - Vector128 H_6 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16(); - row6 = Sse2.Or(row6, H_6); + Vector128 row6D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16(); + Vector128 row6E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16(); + Vector128 row6 = Sse2.Or(row6D, row6E); + Vector128 row6F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16(); + row6 = Sse2.Or(row6, row6F); + Vector128 row6G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16(); + row6 = Sse2.Or(row6, row6G); + Vector128 row6H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16(); + row6 = Sse2.Or(row6, row6H); // row7 - Vector128 F_7 = Ssse3.Shuffle(F, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16(); - Vector128 G_7 = Ssse3.Shuffle(G, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16(); - Vector128 row7 = Sse2.Or(F_7, G_7); - Vector128 H_7 = Ssse3.Shuffle(H, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16(); - row7 = Sse2.Or(row7, H_7); + Vector128 row7F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16(); + Vector128 row7G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16(); + Vector128 row7 = Sse2.Or(row7F, row7G); + Vector128 row7H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16(); + row7 = Sse2.Or(row7, row7H); dest.V0 = row0; dest.V1 = row1; @@ -292,105 +275,60 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components fixed (byte* shuffleVectorsPtr = AvxShuffleMasks) { - // 18 loads - // 10 cross-lane shuffles (permutations) - // 14 shuffles - // 10 bitwise or's - // 4 stores - - // A0 A1 A2 A3 A4 A5 A6 A7 | B0 B1 B2 B3 B4 B5 B6 B7 - // C0 C1 C2 C3 C4 C5 C6 C7 | D0 D1 D2 D3 D4 D5 D6 D7 - // E0 E1 E2 E3 E4 E5 E6 E7 | F0 F1 F2 F3 F4 F5 F6 F7 - // G0 G1 G2 G3 G4 G5 G6 G7 | H0 H1 H2 H3 H4 H5 H6 H7 - Vector256 AB = 
source.V01.AsByte(); - Vector256 CD = source.V23.AsByte(); - Vector256 EF = source.V45.AsByte(); - Vector256 GH = source.V67.AsByte(); - - // row01 - A0 A1 B0 C0 B1 A2 A3 B2 | C1 D0 E0 D1 C2 B3 A4 A5 - Vector256 AB01_EF01_CD23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32(); - - // row01_AB - (A0 A1) (B0 B1) (A2 A3) (B2 B3) | (B2 B3) (A4 A5) (X X) (X X) - Vector256 row01_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); - // row01_AB - (A0 A1) (B0 X) (B1 A2) (A3 B2) | (X X) (X X) (X B3) (A4 A5) + Vector256 rowsAB = source.V01.AsByte(); + Vector256 rowsCD = source.V23.AsByte(); + Vector256 rowsEF = source.V45.AsByte(); + Vector256 rowsGH = source.V67.AsByte(); + + // rows 0 1 + Vector256 rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32(); + Vector256 row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte(); row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte(); - Vector256 CD01_GH23_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32(); - - // row01_CD - (C0 C1) (X X) (X X) (X X) | (C0 C1) (D0 D1) (C2 C3) (X X) - Vector256 row01_CD = Avx2.PermuteVar8x32(CD.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte(); - // row01_CD - (X X) (X C0) (X X) (X X) | (C1 D0) (X D1) (C2 X) (X X) + Vector256 rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32(); + Vector256 row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte(); row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte(); - // row01_EF - (E0 E1) (E2 E3) (F0 F1) (X X) | (E0 E1) (X X) (X X) (X X) - Vector256 row0123_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); - // row01_EF - (X X) (X X) (X X) (X X) | (X X) (E0 X) (X X) (X X) + Vector256 row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), 
rows_AB01_EF01_CD23_shuffleMask).AsByte(); Vector256 row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte(); Vector256 row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF); - - // row23 - B4 C3 D2 E1 F0 G0 F1 E2 | D3 C4 B5 A6 A7 B6 C5 D4 - - Vector256 AB23_CD45_EF67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32(); - - // row23_AB - (B4 B5) (X X) (X X) (X X) | (B4 B5) (B6 B7) (A6 A7) (X X) - Vector256 row2345_AB = Avx2.PermuteVar8x32(AB.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); - // row23_AB - (B4 X) (X X) (X X) (X X) | (X X) (B5 A6) (A7 B6) (X X) + // rows 2 3 + Vector256 rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32(); + Vector256 row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); Vector256 row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte(); - // row23_CD - (C2 C3) (D2 D3) (X X) (X X) | (D2 D3) (C4 C5) (D4 D5) (X X) - Vector256 row23_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB01_EF01_CD23_cr_ln_shfmask).AsByte(); - // row23_CD - (X C3) (D2 X) (X X) (X X) | (D3 C4) (X X) (X X) (C5 D4) + Vector256 row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte(); row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte(); - // row23_EF - (X X) (X E1) (F0 X) (F1 E2) | (X X) (X X) (X X) (X X) Vector256 row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte(); - // row23_GH - (G0 G1) (G2 G3) (H0 H1) (X X) | (G2 G3) (X X) (X X) (X X) - Vector256 row2345_GH = Avx2.PermuteVar8x32(GH.AsInt32(), CD01_GH23_cr_ln_shfmask).AsByte(); - // row23_GH - (X X) (X X) (X G0) (X X) | (X X) (X X) (X X) (X X) + Vector256 row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte(); Vector256 row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 
* 32)).AsByte()); Vector256 row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH)); - - // row45 - E3 F2 G1 H0 H1 G2 F3 E4 | D5 C6 B7 C7 D6 E5 F4 G3 - - // row45_AB - (X X) (X X) (X X) (X X) | (X X) (B7 X) (X X) (X X) + // rows 4 5 Vector256 row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte()); - - // row45_CD - (D6 D7) (X X) (X X) (X X) | (C6 C7) (D4 D5) (D6 D7) (X X) - Vector256 row4567_CD = Avx2.PermuteVar8x32(CD.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); - // row45_CD - (X X) (X X) (X X) (X X) | (D5 C6) (X C7) (D6 X) (X X) + Vector256 row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); Vector256 row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte()); - Vector256 EF45_GH67_cr_ln_shfmask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32(); - - // row45_EF - (E2 E3) (E4 E5) (F2 F3) (X X) | (E4 E5) (F4 F5) (X X) (X X) - Vector256 row45_EF = Avx2.PermuteVar8x32(EF.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte(); - // row45_EF - (E3 F2) (X X) (X X) (F3 E4) | (X X) (X X) (X E5) (F4 X) + Vector256 rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32(); + Vector256 row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte(); row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte()); - // row45_GH - (X X) (G1 H0) (H1 G2) (X X) | (X X) (X X) (X X) (X G3) Vector256 row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte()); Vector256 row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH)); - - // row67 - H2 H3 G4 F5 E6 D7 E7 F6 | G5 H4 H5 G6 F7 G7 H6 H7 - - // row67_CD - (X X) (X X) (X D7) (X X) | (X X) (X X) (X X) (X X) + // rows 6 7 Vector256 row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte()); - // row67_EF - (E6 E7) (F4 F5) (F6 
F7) (X X) | (F6 F7) (X X) (X X) (X X) - Vector256 row67_EF = Avx2.PermuteVar8x32(EF.AsInt32(), AB23_CD45_EF67_cr_ln_shfmask).AsByte(); - // row67_EF - (X X) (X F5) (E6 X) (E7 F6) | (X X) (X X) (F7 X) (X X) + Vector256 row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte()); - // row67_GH - (G4 G5) (H2 H3) (X X) (X X) | (G4 G5) (G6 G7) (H4 H5) (H6 H7) - Vector256 row67_GH = Avx2.PermuteVar8x32(GH.AsInt32(), EF45_GH67_cr_ln_shfmask).AsByte(); - // row67_GH - (H2 H3) (G4 X) (X X) (X X) | (G5 H4) (H5 G6) (X G7) (H6 H7) + Vector256 row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte(); row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte()); Vector256 row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH); diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs index b826193c3..898bbdb45 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs @@ -9,8 +9,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class Block8x8F_Quantize { - private Block8x8F block = default; - private Block8x8F quant = default; + private Block8x8F block = CreateFromScalar(1); + private Block8x8F quant = CreateFromScalar(1); private Block8x8 result = default; [Benchmark] @@ -19,5 +19,32 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant); return this.result[0]; } + + private static Block8x8F CreateFromScalar(float scalar) + { + Block8x8F block = default; + for (int i = 0; i < 64; i++) + { + block[i] = scalar; 
+ } + + return block; + } } } + +/* +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update) +Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 1. No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 2. SSE : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 3. AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + +| Method | Job | Mean | Error | StdDev | Ratio | +|--------- |-----------------|---------:|---------:|---------:|------:| +| Quantize | No HwIntrinsics | 73.34 ns | 1.081 ns | 1.011 ns | 1.00 | +| Quantize | SSE | 24.11 ns | 0.298 ns | 0.279 ns | 0.33 | +| Quantize | AVX | 15.90 ns | 0.074 ns | 0.065 ns | 0.22 | + */ diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs index 47f7d2fbc..28899b51e 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -33,3 +33,17 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations } } } + +/* +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update) +Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + +| Method | Job | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated | +|-------------- |---------------- 
|----------:|----------:|----------:|------:|------:|------:|------:|----------:| +| TransposeInto | No HwIntrinsics | 19.658 ns | 0.0550 ns | 0.0515 ns | 1.00 | - | - | - | - | +| TransposeInto | AVX | 8.613 ns | 0.0249 ns | 0.0208 ns | 0.44 | - | - | - | - | +*/ diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index 5ceb4c8a0..ffe0f4c02 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -65,17 +65,17 @@ namespace SixLabors.ImageSharp.Benchmarks .WithId("1. No HwIntrinsics").AsBaseline()); #if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) + if (Sse.IsSupported) { this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithId("2. AVX")); + .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) + .WithId("2. SSE")); } - if (Sse.IsSupported) + if (Avx.IsSupported) { this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) - .WithId("3. SSE")); + .WithId("3. AVX")); } #endif } From 96f8717b12599af180aafd8c3915eea09811c204 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sat, 11 Sep 2021 06:13:05 +0300 Subject: [PATCH 32/56] Optimized runLength calculation --- .../Components/Encoder/HuffmanScanEncoder.cs | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 75f384848..ad279b577 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -408,22 +408,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // Emit the AC components. 
int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values; - int runLength = 0; int lastValuableIndex = spectralBlock.GetLastNonZeroIndex(); + + int runLength = 0; for (int zig = 1; zig <= lastValuableIndex; zig++) { - int ac = spectralBlock[zig]; + const int zeroRun1 = 1 << 4; + const int zeroRun16 = 16 << 4; + int ac = spectralBlock[zig]; if (ac == 0) { - runLength++; + runLength += zeroRun1; } else { - while (runLength > 15) + while (runLength >= zeroRun16) { this.EmitHuff(acHuffTable, 0xf0); - runLength -= 16; + runLength -= zeroRun16; } this.EmitHuffRLE(acHuffTable, runLength, ac); @@ -498,14 +501,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } /// - /// Emits a run of runLength copies of value encoded with the given Huffman encoder. + /// Emits given value via huffman rle encoding. /// /// Compiled Huffman spec values. - /// The number of copies to encode. + /// The number of preceding zeroes, preshifted by 4 to the left. /// The value to encode. [MethodImpl(InliningOptions.ShortMethod)] private void EmitHuffRLE(int[] table, int runLength, int value) { + DebugGuard.IsTrue((runLength & 0xf) == 0, $"{nameof(runLength)} parameter must be shifted to the left by 4 bits"); + int a = value; int b = value; if (a < 0) @@ -517,7 +522,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder int valueLen = GetHuffmanEncodingLength((uint)a); // Huffman prefix code - int huffPackage = table[(runLength << 4) | valueLen]; + int huffPackage = table[runLength | valueLen]; int prefixLen = huffPackage & 0xff; uint prefix = (uint)huffPackage & 0xffff_0000u; From 91a95b581404b9f32f773e6672c3c98b9f4cfb48 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Sun, 12 Sep 2021 21:44:11 +0300 Subject: [PATCH 33/56] Implemented fallback code for big-endian machines --- .../Components/Encoder/HuffmanScanEncoder.cs | 101 +++++++++++++----- 1 file changed, 77 insertions(+), 24 deletions(-) diff --git 
a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index ad279b577..08f676e40 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -445,21 +445,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder return dc; } - [MethodImpl(InliningOptions.ShortMethod)] - private void FlushRemainingBytes() - { - // Bytes count we want to write to the output stream - int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8); - - // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits - uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount); - - int writeIndex = this.emitWriteIndex; - this.emitBuffer[writeIndex - 1] = packedBytes; - - this.FlushToStream((writeIndex * 4) - valuableBytesCount); - } - /// /// Emits the least significant count of bits to the stream write buffer. /// The precondition is bits @@ -568,28 +553,96 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder #endif } + /// + /// Flushes cached bytes to the ouput stream respecting stuff bytes. + /// + /// + /// Bytes cached via are stored in 4-bytes blocks which makes + /// this method endianness dependent. 
+ /// [MethodImpl(InliningOptions.ShortMethod)] - private void FlushToStream() => this.FlushToStream(this.emitWriteIndex * 4); - - [MethodImpl(InliningOptions.ShortMethod)] - private void FlushToStream(int endIndex) + private void FlushToStream() { Span emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan()); int writeIdx = 0; int startIndex = emitBytes.Length - 1; - for (int i = startIndex; i >= endIndex; i--) + int endIndex = this.emitWriteIndex * sizeof(uint); + + // Some platforms may fail to eliminate this if-else branching + // Even if it happens - buffer is flushed in big packs, + // branching overhead shouldn't be noticeable + if (BitConverter.IsLittleEndian) { - byte value = emitBytes[i]; - this.streamWriteBuffer[writeIdx++] = value; - if (value == 0xff) + // For little endian case bytes are ordered and can be + // safely written to the stream with stuff bytes + // First byte is cached on the most significant index + // so we are going from the end of the array to its beginning: + // ... [ double word #1 ] [ double word #0 ] + // ... [idx3|idx2|idx1|idx0] [idx3|idx2|idx1|idx0] + for (int i = startIndex; i >= endIndex; i--) { - this.streamWriteBuffer[writeIdx++] = 0x00; + byte value = emitBytes[i]; + this.streamWriteBuffer[writeIdx++] = value; + + // Inserting stuff byte + if (value == 0xff) + { + this.streamWriteBuffer[writeIdx++] = 0x00; + } + } + } + else + { + // For big endian case bytes are ordered in 4-byte packs + // which are ordered like bytes in the little endian case by in 4-byte packs: + // ... [ double word #1 ] [ double word #0 ] + // ... 
[idx0|idx1|idx2|idx3] [idx0|idx1|idx2|idx3] + // So we must write each 4-bytes in 'natural order' + for (int i = startIndex; i >= endIndex; i -= 4) + { + // This loop is caused by the nature of underlying byte buffer + // implementation and indeed causes performace by somewhat 5% + // compared to little endian scenario + // Even with this performance drop this cached buffer implementation + // is faster than individually writing bytes using binary shifts and binary and(s) + for (int j = i - 3; j <= i; j++) + { + byte value = emitBytes[j]; + this.streamWriteBuffer[writeIdx++] = value; + + // Inserting stuff byte + if (value == 0xff) + { + this.streamWriteBuffer[writeIdx++] = 0x00; + } + } } } this.target.Write(this.streamWriteBuffer, 0, writeIdx); this.emitWriteIndex = this.emitBuffer.Length; } + + [MethodImpl(InliningOptions.ShortMethod)] + private void FlushRemainingBytes() + { + // Flush full 4-byte blocks + this.FlushToStream(); + + // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits + // And writing only valuable count of bytes count we want to write to the output stream + int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8); + uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount); + + Span emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan()); + for (int i = 0; i < valuableBytesCount; i++) + { + emitBytes[i] = (byte)((packedBytes >> ((3 - i) * 8)) & 0xff); + } + + // Flush remaining 'tail' bytes + this.target.Write(emitBytes, 0, valuableBytesCount); + } } } From 775610d5a0221e11096bbe500adc5bd31d6cbe63 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 13 Sep 2021 00:35:29 +0300 Subject: [PATCH 34/56] Fixed tests, fixed compilation, added DHT marker decoding more meaningful exception messages, fixed invalid jpeg encoding --- .../Components/Encoder/HuffmanScanEncoder.cs | 22 ++++++++----------- .../Formats/Jpeg/JpegDecoderCore.cs | 4 ++-- 
.../ImageSharp.Tests/Formats/Jpg/DCTTests.cs | 20 +++++------------ 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 08f676e40..3e6b0e5f4 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -561,13 +561,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// this method endianness dependent. /// [MethodImpl(InliningOptions.ShortMethod)] - private void FlushToStream() + private void FlushToStream(int endIndex) { Span emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan()); int writeIdx = 0; int startIndex = emitBytes.Length - 1; - int endIndex = this.emitWriteIndex * sizeof(uint); // Some platforms may fail to eliminate this if-else branching // Even if it happens - buffer is flushed in big packs, @@ -621,28 +620,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } this.target.Write(this.streamWriteBuffer, 0, writeIdx); + } + + private void FlushToStream() + { + this.FlushToStream(this.emitWriteIndex * 4); this.emitWriteIndex = this.emitBuffer.Length; } [MethodImpl(InliningOptions.ShortMethod)] private void FlushRemainingBytes() { - // Flush full 4-byte blocks - this.FlushToStream(); - // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits // And writing only valuable count of bytes count we want to write to the output stream int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8); uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount); + this.emitBuffer[--this.emitWriteIndex] = packedBytes; - Span emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan()); - for (int i = 0; i < valuableBytesCount; i++) - { - emitBytes[i] = (byte)((packedBytes >> ((3 - i) * 8)) & 0xff); - } - - // Flush 
remaining 'tail' bytes - this.target.Write(emitBytes, 0, valuableBytesCount); + // Flush cached bytes to the output stream with padding bits + this.FlushToStream((this.emitWriteIndex * 4) - 4 + valuableBytesCount); } } } diff --git a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs index 024743ddb..a0f69bb7b 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs @@ -1071,13 +1071,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg // Types 0..1 DC..AC if (tableType > 1) { - JpegThrowHelper.ThrowInvalidImageContentException("Bad Huffman Table type."); + JpegThrowHelper.ThrowInvalidImageContentException($"Bad huffman table type: {tableType}"); } // Max tables of each type if (tableIndex > 3) { - JpegThrowHelper.ThrowInvalidImageContentException("Bad Huffman Table index."); + JpegThrowHelper.ThrowInvalidImageContentException($"Bad huffman table index: {tableIndex}"); } stream.Read(huffmanDataSpan, 0, 16); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index 55d208c5a..b4d3769d7 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -2,9 +2,6 @@ // Licensed under the Apache License, Version 2.0. 
using System; -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics.X86; #endif @@ -121,24 +118,18 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg public void IDCT8x8_Avx(int seed) { #if SUPPORTS_RUNTIME_INTRINSICS - var skip = !Avx.IsSupported; -#else - var skip = true; -#endif - - if (skip) + if (!Avx.IsSupported) { this.Output.WriteLine("No AVX present, skipping test!"); - return; } Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); + Block8x8F srcBlock = default; srcBlock.LoadFrom(src); - var destBlock = default(Block8x8F); + Block8x8F destBlock = default; - var expectedDest = new float[64]; + float[] expectedDest = new float[64]; // reference, left part ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest); @@ -149,10 +140,11 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg // testee, whole 8x8 FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock); - var actualDest = new float[64]; + float[] actualDest = new float[64]; destBlock.ScaledCopyTo(actualDest); Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); +#endif } [Theory] From a7dada1d4d47260b1f82ba4df310d9698cf7542a Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 13 Sep 2021 00:44:02 +0300 Subject: [PATCH 35/56] Fixed huffman lut summary --- .../Jpeg/Components/Encoder/HuffmanLut.cs | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs index f563e74e0..44b39dfd7 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs @@ -4,12 +4,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { /// - /// TODO: THIS IS NO LONGER TRUE, INTERNAL 
REPRESENTATION WAS CHANGED AND THIS DOC SHOULD BE CHANGED TOO!!! /// A compiled look-up table representation of a huffmanSpec. - /// Each value maps to a int32 of which the 24 most significant bits hold the - /// codeword in bits and the 8 least significant bits hold the codeword size. /// The maximum codeword size is 16 bits. /// + /// + /// + /// Each value maps to a int32 of which the 24 most significant bits hold the + /// codeword in bits and the 8 least significant bits hold the codeword size. + /// + /// + /// Code value occupies 24 most significant bits as integer value. + /// This value is shifted to the MSB position for performance reasons. + /// For example, decimal value 10 is stored like this: + /// + /// MSB LSB + /// 1010 0000 00000000 00000000 | 00000100 + /// + /// This was done to eliminate extra binary shifts in the encoder. + /// While code length is represented as 8 bit integer value + /// + /// internal readonly struct HuffmanLut { /// From 24bf7c111d9e7e3fbdae1c1f5002e0735bdddd20 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 13 Sep 2021 01:20:54 +0300 Subject: [PATCH 36/56] Restored sandbox --- .../Program.cs | 81 ++----------------- 1 file changed, 8 insertions(+), 73 deletions(-) diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs index 7f1817e5d..51d616fc7 100644 --- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs +++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs @@ -1,10 +1,4 @@ -// Copyright (c) Six Labors. -// Licensed under the Apache License, Version 2.0. 
- using System; -using System.Diagnostics; -using System.IO; -using SixLabors.ImageSharp.Formats.Jpeg; using SixLabors.ImageSharp.Tests.Formats.Jpg; using SixLabors.ImageSharp.Tests.PixelFormats.PixelOperations; using SixLabors.ImageSharp.Tests.ProfilingBenchmarks; @@ -34,73 +28,14 @@ namespace SixLabors.ImageSharp.Tests.ProfilingSandbox /// public static void Main(string[] args) { - BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCrRatio444); - BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCrRatio444); - BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCrRatio444); - BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCrRatio444); - - //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.YCbCrRatio420); - //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.YCbCrRatio420); - //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.YCbCrRatio420); - //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.YCbCrRatio420); - - //BenchmarkEncoder("snow_main", 200, 100, JpegColorType.Luminance); - //BenchmarkEncoder("snow_main", 200, 90, JpegColorType.Luminance); - //BenchmarkEncoder("snow_main", 200, 75, JpegColorType.Luminance); - //BenchmarkEncoder("snow_main", 200, 50, JpegColorType.Luminance); - - //ReEncodeImage("snow_main", 100); - //ReEncodeImage("snow_main", 90); - //ReEncodeImage("snow_main", 75); - //ReEncodeImage("snow_main", 50); - - Console.WriteLine("Done."); - } - - const string pathTemplate = "C:\\Users\\pl4nu\\Downloads\\{0}.jpg"; - - private static void BenchmarkEncoder(string fileName, int iterations, int quality, JpegColorType color) - { - string loadPath = String.Format(pathTemplate, fileName); - - using var inputStream = new FileStream(loadPath, FileMode.Open); - using var saveStream = new MemoryStream(); - - var decoder = new JpegDecoder { IgnoreMetadata = true }; - using Image img = decoder.Decode(Configuration.Default, inputStream); - - var encoder = new JpegEncoder() - { - Quality = quality, - ColorType = color - 
}; - - Stopwatch sw = new Stopwatch(); - sw.Start(); - for (int i = 0; i < iterations; i++) - { - img.SaveAsJpeg(saveStream, encoder); - saveStream.Position = 0; - } - sw.Stop(); - - Console.WriteLine($"// Encoding q={quality} | color={color}\n" + - $"// Elapsed: {sw.ElapsedMilliseconds}ms across {iterations} iterations\n" + - $"// Average: {(double)sw.ElapsedMilliseconds / iterations}ms"); - } - - private static void ReEncodeImage(string fileName, int quality) - { - string loadPath = String.Format(pathTemplate, fileName); - using Image img = Image.Load(loadPath); - - string savePath = String.Format(pathTemplate, $"q{quality}_test_{fileName}"); - var encoder = new JpegEncoder() - { - Quality = quality, - ColorType = JpegColorType.YCbCrRatio444 - }; - img.SaveAsJpeg(savePath, encoder); + LoadResizeSaveParallelMemoryStress.Run(); + // RunJpegEncoderProfilingTests(); + // RunJpegColorProfilingTests(); + // RunDecodeJpegProfilingTests(); + // RunToVector4ProfilingTest(); + // RunResizeProfilingTest(); + + // Console.ReadLine(); } private static void RunJpegEncoderProfilingTests() From 4fd912b9dd84f6a5c8774f110d719f188488f55f Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 13 Sep 2021 09:21:35 +0300 Subject: [PATCH 37/56] Fixed Ssse3 zig-zag implementation --- .../Formats/Jpeg/Components/Block8x8F.cs | 4 +- .../Jpeg/Components/ZigZag.Intrinsic.cs | 228 ++++++++++-------- .../Formats/Jpg/Block8x8FTests.cs | 49 ++-- .../Formats/Jpg/Utils/JpegFixture.cs | 32 +++ .../FeatureTesting/FeatureTestRunner.cs | 46 ++++ 5 files changed, 241 insertions(+), 118 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index d93375f39..24177c556 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -414,12 +414,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components if (Avx2.IsSupported) { MultiplyIntoInt16_Avx2(ref block, ref 
qt, ref dest); - ZigZag.ApplyZigZagOrderingAvx(ref dest, ref dest); + ZigZag.ApplyZigZagOrderingAvx(ref dest); } else if (Ssse3.IsSupported) { MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest); - ZigZag.ApplyZigZagOrderingSse(ref dest, ref dest); + ZigZag.ApplyZigZagOrderingSse(ref dest); } else #endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index abe02d040..eb15c8b55 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -21,6 +21,47 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Gets shuffle vectors for /// zig zag implementation. /// + private static ReadOnlySpan SseShuffleMasks1 => new byte[] + { + // row0 + 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _, + _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5, + _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, + + // row1 + _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11, + 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _, + _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _, + + // row2 + _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, + _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, + + // row3 + _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _, + _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _, + _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, + 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, + + // row4 + _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _, + _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, + _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _, + + // row5 + _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, + 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _, + + // row6 + _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _, + _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13, + 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, + + // row7 + 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, + _, _, 8, 9, 10, 
11, _, _, _, _, _, _, 12, 13, 14, 15 + }; + private static ReadOnlySpan SseShuffleMasks => new byte[] { // row0 @@ -56,7 +97,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components // row4 // E F G H - // 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, + 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _, @@ -152,112 +193,99 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Requires Ssse3 support. /// - /// Input matrix. + /// Input matrix. /// Matrix to store the result. Can be a reference to input matrix. - public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 source, ref Block8x8 dest) + public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 block) { DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!"); - fixed (byte* maskPtr = SseShuffleMasks) + fixed (byte* maskPtr = SseShuffleMasks1) { - Vector128 rowA = source.V0.AsByte(); - Vector128 rowB = source.V1.AsByte(); - Vector128 rowC = source.V2.AsByte(); - Vector128 rowD = source.V3.AsByte(); - Vector128 rowE = source.V4.AsByte(); - Vector128 rowF = source.V5.AsByte(); - Vector128 rowG = source.V6.AsByte(); - Vector128 rowH = source.V7.AsByte(); - - // row0 - Vector128 row0A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (0 * 16))).AsInt16(); - Vector128 row0B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (1 * 16))).AsInt16(); - Vector128 row0 = Sse2.Or(row0A, row0B); - Vector128 row0C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (2 * 16))).AsInt16(); - row0 = Sse2.Or(row0, row0C); - - // row1 - Vector128 row1A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (3 * 16))).AsInt16(); - Vector128 row1B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (4 * 16))).AsInt16(); - Vector128 row1 = Sse2.Or(row1A, row1B); - Vector128 row1C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (5 * 
16))).AsInt16(); - row1 = Sse2.Or(row1, row1C); - Vector128 row1D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (6 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1D); - Vector128 row1E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (7 * 16))).AsInt16(); - row1 = Sse2.Or(row1, row1E); + Vector128 rowA = block.V0.AsByte(); + Vector128 rowB = block.V1.AsByte(); + Vector128 rowC = block.V2.AsByte(); + Vector128 rowD = block.V3.AsByte(); + Vector128 rowE = block.V4.AsByte(); + Vector128 rowF = block.V5.AsByte(); + Vector128 rowG = block.V6.AsByte(); + Vector128 rowH = block.V7.AsByte(); + + // row0 - A0 A1 B0 C0 B1 A2 A3 B2 + Vector128 rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16(); + Vector128 rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16(); + Vector128 row0 = Sse2.Or(rowA0, rowB0); + Vector128 rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16(); + row0 = Sse2.Or(row0, rowC0); + + // row1 - C1 D0 E0 D1 C2 B3 A4 A5 + Vector128 rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16(); + Vector128 rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16(); + Vector128 row1 = Sse2.Or(rowA1, rowC1); + Vector128 rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16(); + row1 = Sse2.Or(row1, rowD1); + row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16(); + row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16(); // row2 - Vector128 row2B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (8 * 16))).AsInt16(); - Vector128 row2C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (9 * 16))).AsInt16(); - Vector128 row2 = Sse2.Or(row2B, row2C); - Vector128 row2D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (10 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2D); - Vector128 row2E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (11 * 16))).AsInt16(); - row2 = 
Sse2.Or(row2, row2E); - Vector128 row2F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (12 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2F); - Vector128 row2G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (13 * 16))).AsInt16(); - row2 = Sse2.Or(row2, row2G); + Vector128 rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16(); + Vector128 rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16(); + Vector128 row2 = Sse2.Or(rowE2, rowF2); + row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16(); + row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16(); + row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16(); + row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16(); // row3 - Vector128 row3A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (14 * 16))).AsInt16().AsInt16(); - Vector128 row3B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (15 * 16))).AsInt16().AsInt16(); - Vector128 row3 = Sse2.Or(row3A, row3B); - Vector128 row3C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16(); - row3 = Sse2.Or(row3, row3C); - Vector128 row3D_row4E_shuffleMask = Sse2.LoadVector128(maskPtr + (17 * 16)); - Vector128 row3D = Ssse3.Shuffle(rowD, row3D_row4E_shuffleMask).AsInt16(); - row3 = Sse2.Or(row3, row3D); + Vector128 rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16().AsInt16(); + Vector128 rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16().AsInt16(); + Vector128 row3 = Sse2.Or(rowA3, rowB3); + Vector128 rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16(); + row3 = Sse2.Or(row3, rowC3); + Vector128 shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11)); + Vector128 rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16(); + row3 = Sse2.Or(row3, rowD3); // row4 - Vector128 row4E = Ssse3.Shuffle(rowE, 
row3D_row4E_shuffleMask).AsInt16(); - Vector128 row4F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (18 * 16))).AsInt16(); - Vector128 row4 = Sse2.Or(row4E, row4F); - Vector128 row4G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (19 * 16))).AsInt16(); - row4 = Sse2.Or(row4, row4G); - Vector128 row4H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (20 * 16))).AsInt16(); - row4 = Sse2.Or(row4, row4H); + Vector128 rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16(); + Vector128 rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16(); + Vector128 row4 = Sse2.Or(rowE4, rowF4); + Vector128 rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16(); + row4 = Sse2.Or(row4, rowG4); + Vector128 rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16(); + row4 = Sse2.Or(row4, rowH4); // row5 - Vector128 row5B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (21 * 16))).AsInt16(); - Vector128 row5C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (22 * 16))).AsInt16(); - Vector128 row5 = Sse2.Or(row5B, row5C); - Vector128 row5D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (23 * 16))).AsInt16(); - row5 = Sse2.Or(row5, row5D); - Vector128 row5E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (24 * 16))).AsInt16(); - row5 = Sse2.Or(row5, row5E); - Vector128 row5F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (25 * 16))).AsInt16(); - row5 = Sse2.Or(row5, row5F); - Vector128 row5G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (26 * 16))).AsInt16(); - row5 = Sse2.Or(row5, row5G); + Vector128 rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16(); + Vector128 rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16(); + Vector128 row5 = Sse2.Or(rowC5, rowD5); + row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16(); + row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16(); + row5 = 
Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16(); + row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16(); // row6 - Vector128 row6D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (27 * 16))).AsInt16(); - Vector128 row6E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (28 * 16))).AsInt16(); - Vector128 row6 = Sse2.Or(row6D, row6E); - Vector128 row6F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (29 * 16))).AsInt16(); - row6 = Sse2.Or(row6, row6F); - Vector128 row6G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (30 * 16))).AsInt16(); - row6 = Sse2.Or(row6, row6G); - Vector128 row6H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (31 * 16))).AsInt16(); - row6 = Sse2.Or(row6, row6H); + Vector128 rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16(); + Vector128 rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16(); + Vector128 row6 = Sse2.Or(rowE6, rowF6); + Vector128 rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16(); + row6 = Sse2.Or(row6, rowH6); + row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16(); + row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16(); // row7 - Vector128 row7F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (32 * 16))).AsInt16(); - Vector128 row7G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (33 * 16))).AsInt16(); - Vector128 row7 = Sse2.Or(row7F, row7G); - Vector128 row7H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (35 * 16))).AsInt16(); - row7 = Sse2.Or(row7, row7H); - - dest.V0 = row0; - dest.V1 = row1; - dest.V2 = row2; - dest.V3 = row3; - dest.V4 = row4; - dest.V5 = row5; - dest.V6 = row6; - dest.V7 = row7; + Vector128 rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16(); + Vector128 rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16(); + Vector128 row7 = 
Sse2.Or(rowG7, rowH7); + row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16(); + + block.V0 = row0; + block.V1 = row1; + block.V2 = row2; + block.V3 = row3; + block.V4 = row4; + block.V5 = row5; + block.V6 = row6; + block.V7 = row7; } } @@ -267,18 +295,18 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Requires Avx2 support. /// - /// Input matrix. + /// Input matrix. /// Matrix to store the result. Can be a reference to input matrix. - public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 source, ref Block8x8 dest) + public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 block) { DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); fixed (byte* shuffleVectorsPtr = AvxShuffleMasks) { - Vector256 rowsAB = source.V01.AsByte(); - Vector256 rowsCD = source.V23.AsByte(); - Vector256 rowsEF = source.V45.AsByte(); - Vector256 rowsGH = source.V67.AsByte(); + Vector256 rowsAB = block.V01.AsByte(); + Vector256 rowsCD = block.V23.AsByte(); + Vector256 rowsEF = block.V45.AsByte(); + Vector256 rowsGH = block.V67.AsByte(); // rows 0 1 Vector256 rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32(); @@ -333,10 +361,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components Vector256 row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH); - dest.V01 = row01.AsInt16(); - dest.V23 = row23.AsInt16(); - dest.V45 = row45.AsInt16(); - dest.V67 = row67.AsInt16(); + block.V01 = row01.AsInt16(); + block.V23 = row23.AsInt16(); + block.V45 = row45.AsInt16(); + block.V67 = row67.AsInt16(); } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 89ef74d8b..40e42acb3 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -4,7 +4,9 @@ // Uncomment this to turn unit tests into benchmarks: // #define BENCHMARKING 
using System; -using System.Diagnostics; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics.X86; +#endif using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils; @@ -247,30 +249,45 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg this.CompareBlocks(expected, actual, 0); } - // TODO: intrinsic tests [Theory] [InlineData(1, 2)] [InlineData(2, 1)] public void Quantize(int srcSeed, int qtSeed) { - Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed); - Block8x8F quant = CreateRandomFloatBlock(-2000, 2000, qtSeed); + static void RunTest(string srcSeedSerialized, string qtSeedSerialized) + { + int srcSeed = FeatureTestRunner.Deserialize(srcSeedSerialized); + int qtSeed = FeatureTestRunner.Deserialize(qtSeedSerialized); - // Reference implementation quantizes given block via division - Block8x8 expected = default; - ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder); + Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed); - // Actual current implementation quantizes given block via multiplication - // With quantization table reciprocal - for (int i = 0; i < Block8x8F.Size; i++) - { - quant[i] = 1f / quant[i]; - } + // Quantization code is used only in jpeg where it's guaranteed that + // qunatization valus are greater than 1 + // Quantize method supports negative numbers by very small numbers can cause troubles + Block8x8F quant = CreateRandomFloatBlock(1, 2000, qtSeed); + + // Reference implementation quantizes given block via division + Block8x8 expected = default; + ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder); + + // Actual current implementation quantizes given block via multiplication + // With quantization table reciprocal + for (int i = 0; i < Block8x8F.Size; i++) + { + quant[i] = 1f / quant[i]; + } - Block8x8 actual = default; - Block8x8F.Quantize(ref source, ref actual, ref quant); + Block8x8 
actual = default; + Block8x8F.Quantize(ref source, ref actual, ref quant); - this.CompareBlocks(expected, actual, 1); + Assert.True(CompareBlocks(expected, actual, 1, out int diff), $"Blocks are not equal, diff={diff}"); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + srcSeed, + qtSeed, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); } [Fact] diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs index ccb7f6f1e..1cf9bc4ae 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs @@ -190,6 +190,38 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils Assert.False(failed); } + internal static bool CompareBlocks(Block8x8 a, Block8x8 b, int tolerance, out int diff) + { + bool res = CompareBlocks(a.AsFloatBlock(), b.AsFloatBlock(), tolerance + 1e-5f, out float fdiff); + diff = (int)fdiff; + return res; + } + + internal static bool CompareBlocks(Block8x8F a, Block8x8F b, float tolerance, out float diff) => + CompareBlocks(a.ToArray(), b.ToArray(), tolerance, out diff); + + internal static bool CompareBlocks(Span a, Span b, float tolerance, out float diff) + { + var comparer = new ApproximateFloatComparer(tolerance); + bool failed = false; + + diff = 0; + + for (int i = 0; i < 64; i++) + { + float expected = a[i]; + float actual = b[i]; + diff += Math.Abs(expected - actual); + + if (!comparer.Equals(expected, actual)) + { + failed = true; + } + } + + return !failed; + } + internal static JpegDecoderCore ParseJpegStream(string testFileName, bool metaDataOnly = false) { byte[] bytes = TestFile.Create(testFileName).Bytes; diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs index fa0f02ca1..0d2f3fcef 100644 --- 
a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs +++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs @@ -301,6 +301,52 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities } } + /// + /// Runs the given test within an environment + /// where the given features. + /// + /// The test action to run. + /// The value to pass as a parameter #0 to the test action. + /// The value to pass as a parameter #1 to the test action. + /// The intrinsics features. + public static void RunWithHwIntrinsicsFeature( + Action action, + T arg0, + T arg1, + HwIntrinsics intrinsics) + where T : IConvertible + { + if (!RemoteExecutor.IsSupported) + { + return; + } + + foreach (KeyValuePair intrinsic in intrinsics.ToFeatureKeyValueCollection()) + { + var processStartInfo = new ProcessStartInfo(); + if (intrinsic.Key != HwIntrinsics.AllowAll) + { + processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0"; + + RemoteExecutor.Invoke( + action, + arg0.ToString(), + arg1.ToString(), + new RemoteInvokeOptions + { + StartInfo = processStartInfo + }) + .Dispose(); + } + else + { + // Since we are running using the default architecture there is no + // point creating the overhead of running the action in a separate process. 
+ action(arg0.ToString(), arg1.ToString()); + } + } + } + internal static Dictionary ToFeatureKeyValueCollection(this HwIntrinsics intrinsics) { // Loop through and translate the given values into COMPlus equivaluents From 8cd4c9724c79645a074ee250f2d1739b7464520b Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 13 Sep 2021 09:31:32 +0300 Subject: [PATCH 38/56] Removed debug ssse3 zig-zag shuffle table --- .../Jpeg/Components/ZigZag.Intrinsic.cs | 66 +------------------ 1 file changed, 1 insertion(+), 65 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index eb15c8b55..01a00180a 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -21,107 +21,43 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Gets shuffle vectors for /// zig zag implementation. /// - private static ReadOnlySpan SseShuffleMasks1 => new byte[] - { - // row0 - 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _, - _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5, - _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, - - // row1 - _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11, - 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _, - _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _, - - // row2 - _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, - _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, - - // row3 - _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _, - _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _, - _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, - 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, - - // row4 - _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _, - _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, - _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _, - - // row5 - _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, - 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _, - - // row6 - _, _, _, _, _, _, 
_, _, 12, 13, _, _, 14, 15, _, _, - _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13, - 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, - - // row7 - 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, - _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15 - }; - private static ReadOnlySpan SseShuffleMasks => new byte[] { // row0 - // A B C 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, // row1 - // A B C D E _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11, - _, _, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _, - _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, // row2 - // B C D E F G - 8, 9, _, _, _, _, _, _, _, _, _, _, _, _, _, _, - _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, - _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, - _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, // row3 - // A B C D - // D shuffle mask is the for row4 E row shuffle mask _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, // row4 - // E F G H - 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _, _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _, // row5 - // B C D E F G - _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _, - _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, - _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, _, _, - _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, // row6 - // D E F G H - _, _, _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _, 
_, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13, - _, _, _, _, 8, 9, _, _, _, _, _, _, _, _, _, _, 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, // row7 - // F G H - _, _, _, _, _, _, _, _, 14, 15, _, _, _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15 }; @@ -199,7 +135,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!"); - fixed (byte* maskPtr = SseShuffleMasks1) + fixed (byte* maskPtr = SseShuffleMasks) { Vector128 rowA = block.V0.AsByte(); Vector128 rowB = block.V1.AsByte(); From c6c9f2beefba2f21c609328930b42a06be86be42 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Mon, 13 Sep 2021 09:38:26 +0300 Subject: [PATCH 39/56] Fixed docs --- src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index 01a00180a..6fa776e2a 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -130,7 +130,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Requires Ssse3 support. /// /// Input matrix. - /// Matrix to store the result. Can be a reference to input matrix. public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 block) { DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!"); @@ -232,7 +231,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Requires Avx2 support. /// /// Input matrix. - /// Matrix to store the result. Can be a reference to input matrix. 
public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 block) { DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); From 6b3f0f7bd9d838b47b97aa676cbd6b3253dabb14 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 14 Sep 2021 01:12:39 +0300 Subject: [PATCH 40/56] gfoidl fixes --- .../Formats/Jpeg/Components/Block8x8.cs | 10 +++++----- .../Jpeg/Components/Block8x8F.Intrinsic.cs | 6 +++--- .../Formats/Jpeg/Components/Block8x8F.cs | 4 ++-- .../Components/Encoder/HuffmanScanEncoder.cs | 9 +++++---- .../Jpeg/Components/FastFloatingPointDCT.cs | 4 ++-- .../Formats/Jpeg/Components/ZigZag.Intrinsic.cs | 14 ++++---------- .../Formats/Jpg/Block8x8Tests.cs | 16 ++++++++-------- 7 files changed, 29 insertions(+), 34 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index 71077675d..9cefedc1d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -172,7 +172,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components public static Block8x8 Load(Span data) { - Block8x8 result = default; + Unsafe.SkipInit(out Block8x8 result); result.LoadFrom(data); return result; } @@ -204,7 +204,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { ref byte selfRef = ref Unsafe.As(ref this); ref byte destRef = ref MemoryMarshal.GetReference(MemoryMarshal.Cast(destination)); - Unsafe.CopyBlock(ref destRef, ref selfRef, Size * sizeof(short)); + Unsafe.CopyBlockUnaligned(ref destRef, ref selfRef, Size * sizeof(short)); } /// @@ -287,7 +287,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Index of the last non-zero element. Returns -1 if all elements are equal to zero. 
/// [MethodImpl(InliningOptions.ShortMethod)] - public int GetLastNonZeroIndex() + public nint GetLastNonZeroIndex() { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported) @@ -298,7 +298,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components ref Vector256 mcuStride = ref Unsafe.As>(ref this); - for (int i = 3; i >= 0; i--) + for (nint i = 3; i >= 0; i--) { int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i), zero16).AsByte()); @@ -325,7 +325,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components else #endif { - int index = Size - 1; + nint index = Size - 1; ref short elemRef = ref Unsafe.As(ref this); while (index >= 0 && Unsafe.Add(ref elemRef, index) == 0) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index 733d32892..e78802472 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -46,7 +46,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components ref Vector256 destRef = ref dest.V01; - for (int i = 0; i < 8; i += 2) + for (nint i = 0; i < 8; i += 2) { Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); @@ -54,7 +54,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components Vector256 row = Avx2.PackSignedSaturate(row0, row1); row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16(); - Unsafe.Add(ref destRef, i / 2) = row; + Unsafe.Add(ref destRef, (IntPtr)((uint)i / 2)) = row; } } @@ -73,7 +73,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); Vector128 row = Sse2.PackSignedSaturate(left, right); - 
Unsafe.Add(ref destBase, i / 2) = row; + Unsafe.Add(ref destBase, (IntPtr)((uint)i / 2)) = row; } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 24177c556..986af3417 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -414,12 +414,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components if (Avx2.IsSupported) { MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest); - ZigZag.ApplyZigZagOrderingAvx(ref dest); + ZigZag.ApplyZigZagOrderingAvx2(ref dest); } else if (Ssse3.IsSupported) { MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest); - ZigZag.ApplyZigZagOrderingSse(ref dest); + ZigZag.ApplyZigZagOrderingSsse3(ref dest); } else #endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 3e6b0e5f4..35e0e2648 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -115,7 +115,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder private bool IsFlushNeeded { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => this.emitWriteIndex < this.emitBuffer.Length / 2; + get => this.emitWriteIndex < (uint)this.emitBuffer.Length / 2; } /// @@ -408,15 +408,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // Emit the AC components. 
int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values; - int lastValuableIndex = spectralBlock.GetLastNonZeroIndex(); + nint lastValuableIndex = spectralBlock.GetLastNonZeroIndex(); int runLength = 0; - for (int zig = 1; zig <= lastValuableIndex; zig++) + ref short blockRef = ref Unsafe.As(ref spectralBlock); + for (nint zig = 1; zig <= lastValuableIndex; zig++) { const int zeroRun1 = 1 << 4; const int zeroRun16 = 16 << 4; - int ac = spectralBlock[zig]; + int ac = Unsafe.Add(ref blockRef, zig); if (ac == 0) { runLength += zeroRun1; diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 1c5cfc8d6..4f7db7c59 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -68,7 +68,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Values are also scaled by 8 so DCT code won't do unnecessary division. 
/// /// - public static ReadOnlySpan DctReciprocalAdjustmentCoefficients => new float[] + public static readonly float[] DctReciprocalAdjustmentCoefficients = new float[] { 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, @@ -104,7 +104,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components public static void TransformFDCT(ref Block8x8F block) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported || Sse.IsSupported) + if (Sse.IsSupported) { ForwardTransformSimd(ref block); } diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs index 6fa776e2a..6577739c1 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -18,7 +18,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components #pragma warning restore SA1309 /// - /// Gets shuffle vectors for + /// Gets shuffle vectors for /// zig zag implementation. /// private static ReadOnlySpan SseShuffleMasks => new byte[] @@ -63,7 +63,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components }; /// - /// Gets shuffle vectors for + /// Gets shuffle vectors for /// zig zag implementation. /// private static ReadOnlySpan AvxShuffleMasks => new byte[] @@ -126,11 +126,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics. /// - /// - /// Requires Ssse3 support. - /// /// Input matrix. 
- public static unsafe void ApplyZigZagOrderingSse(ref Block8x8 block) + public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block) { DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!"); @@ -227,11 +224,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics. /// - /// - /// Requires Avx2 support. - /// /// Input matrix. - public static unsafe void ApplyZigZagOrderingAvx(ref Block8x8 block) + public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block) { DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs index 69375ae1b..3737cce80 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs @@ -130,9 +130,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg { Block8x8 data = default; - int expected = -1; + nint expected = -1; - int actual = data.GetLastNonZeroIndex(); + nint actual = data.GetLastNonZeroIndex(); Assert.Equal(expected, actual); } @@ -153,9 +153,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg data[i] = 10; } - int expected = Block8x8.Size - 1; + nint expected = Block8x8.Size - 1; - int actual = data.GetLastNonZeroIndex(); + nint actual = data.GetLastNonZeroIndex(); Assert.Equal(expected, actual); } @@ -182,9 +182,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int setIndex = rng.Next(1, Block8x8.Size); data[setIndex] = (short)rng.Next(-2000, 2000); - int expected = setIndex; + nint expected = setIndex; - int actual = data.GetLastNonZeroIndex(); + nint actual = data.GetLastNonZeroIndex(); Assert.Equal(expected, actual); } @@ -219,7 +219,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expected = lastIndex; - int actual = data.GetLastNonZeroIndex(); + nint actual = 
data.GetLastNonZeroIndex(); Assert.Equal(expected, actual); } @@ -265,7 +265,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg int expected = secondChunkEnd; - int actual = data.GetLastNonZeroIndex(); + nint actual = data.GetLastNonZeroIndex(); Assert.True(expected == actual, $"Expected: {expected}\nActual: {actual}\nInput matrix: {data}"); } From d934bad69e554517df55c204a7e7f482f58ddef4 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 17 Sep 2021 05:01:12 +0300 Subject: [PATCH 41/56] gfoidl fixes --- src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index 9cefedc1d..9d49b8c45 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -225,10 +225,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components [MethodImpl(InliningOptions.ShortMethod)] public void LoadFrom(Span source) { - ref byte s = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); - ref byte d = ref Unsafe.As(ref this); + ref byte sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref byte destRef = ref Unsafe.As(ref this); - Unsafe.CopyBlock(ref d, ref s, Size * sizeof(short)); + Unsafe.CopyBlockUnaligned(ref destRef, ref sourceRef, Size * sizeof(short)); } /// From f7bc8d77479781899924afa2f28773fe61ec48ce Mon Sep 17 00:00:00 2001 From: Gerard Gunnewijk Date: Wed, 22 Sep 2021 18:23:07 +0200 Subject: [PATCH 42/56] Added test image & test method --- ImageSharp.sln | 4 ++-- .../Formats/Png/PngDecoderTests.cs | 18 ++++++++++++++++++ tests/ImageSharp.Tests/TestImages.cs | 3 +++ tests/Images/Input/Png/issues/Issue_1765.png | 3 +++ 4 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 tests/Images/Input/Png/issues/Issue_1765.png diff --git a/ImageSharp.sln b/ImageSharp.sln index bf1f3579c..c71ec11d7 100644 --- 
a/ImageSharp.sln +++ b/ImageSharp.sln @@ -1,6 +1,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.28902.138 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31710.8 MinimumVisualStudioVersion = 10.0.40219.1 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "_root", "_root", "{C317F1B1-D75E-4C6D-83EB-80367343E0D7}" ProjectSection(SolutionItems) = preProject diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs index 9832aeb7b..a517c4a4a 100644 --- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs @@ -368,6 +368,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Png Assert.Null(ex); } + // https://github.com/SixLabors/ImageSharp/issues/1765 + [Theory] + [WithFile(TestImages.Png.Issue1765, PixelTypes.Rgba32)] + public void Issue1765(TestImageProvider provider) + where TPixel : unmanaged, IPixel + { + System.Exception ex = Record.Exception( + () => + { + using (Image image = provider.GetImage(PngDecoder)) + { + image.DebugSave(provider); + image.CompareToOriginal(provider, ImageComparer.Exact); + } + }); + Assert.Null(ex); + } + // https://github.com/SixLabors/ImageSharp/issues/410 [Theory] [WithFile(TestImages.Png.Bad.Issue410_MalformedApplePng, PixelTypes.Rgba32)] diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs index d1a6624af..ee85029ce 100644 --- a/tests/ImageSharp.Tests/TestImages.cs +++ b/tests/ImageSharp.Tests/TestImages.cs @@ -111,6 +111,9 @@ namespace SixLabors.ImageSharp.Tests // Issue 935: https://github.com/SixLabors/ImageSharp/issues/935 public const string Issue935 = "Png/issues/Issue_935.png"; + // Issue 1765: https://github.com/SixLabors/ImageSharp/issues/1765 + public const string Issue1765 = "png/issues/Issue_1765.png"; + public static class Bad { public const string MissingDataChunk = 
"Png/xdtn0g01.png"; diff --git a/tests/Images/Input/Png/issues/Issue_1765.png b/tests/Images/Input/Png/issues/Issue_1765.png new file mode 100644 index 000000000..c9705550f --- /dev/null +++ b/tests/Images/Input/Png/issues/Issue_1765.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86ea14567bcd259d76dc782ee366c23a5755714c6d48f636524b23e75b89e5b6 +size 775275 From c1e8c15b88296c01d4c2976c949b4442b7bb73ad Mon Sep 17 00:00:00 2001 From: Gerard Gunnewijk Date: Wed, 22 Sep 2021 18:23:07 +0200 Subject: [PATCH 43/56] Added test image & test method --- ImageSharp.sln | 5 +++-- .../Formats/Png/PngDecoderTests.cs | 18 ++++++++++++++++++ tests/ImageSharp.Tests/TestImages.cs | 3 +++ tests/Images/Input/Png/issues/Issue_1765.png | 3 +++ 4 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 tests/Images/Input/Png/issues/Issue_1765.png diff --git a/ImageSharp.sln b/ImageSharp.sln index bf1f3579c..b6f3b5a0f 100644 --- a/ImageSharp.sln +++ b/ImageSharp.sln @@ -1,6 +1,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.28902.138 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31710.8 MinimumVisualStudioVersion = 10.0.40219.1 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "_root", "_root", "{C317F1B1-D75E-4C6D-83EB-80367343E0D7}" ProjectSection(SolutionItems) = preProject @@ -403,6 +403,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "issues", "issues", "{670DD4 tests\Images\Input\Png\issues\Issue_1127.png = tests\Images\Input\Png\issues\Issue_1127.png tests\Images\Input\Png\issues\Issue_1177_1.png = tests\Images\Input\Png\issues\Issue_1177_1.png tests\Images\Input\Png\issues\Issue_1177_2.png = tests\Images\Input\Png\issues\Issue_1177_2.png + tests\Images\Input\Png\issues\Issue_1765.png = tests\Images\Input\Png\issues\Issue_1765.png tests\Images\Input\Png\issues\Issue_410.png = tests\Images\Input\Png\issues\Issue_410.png 
tests\Images\Input\Png\issues\Issue_935.png = tests\Images\Input\Png\issues\Issue_935.png EndProjectSection diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs index 9832aeb7b..a517c4a4a 100644 --- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs @@ -368,6 +368,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Png Assert.Null(ex); } + // https://github.com/SixLabors/ImageSharp/issues/1765 + [Theory] + [WithFile(TestImages.Png.Issue1765, PixelTypes.Rgba32)] + public void Issue1765(TestImageProvider provider) + where TPixel : unmanaged, IPixel + { + System.Exception ex = Record.Exception( + () => + { + using (Image image = provider.GetImage(PngDecoder)) + { + image.DebugSave(provider); + image.CompareToOriginal(provider, ImageComparer.Exact); + } + }); + Assert.Null(ex); + } + // https://github.com/SixLabors/ImageSharp/issues/410 [Theory] [WithFile(TestImages.Png.Bad.Issue410_MalformedApplePng, PixelTypes.Rgba32)] diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs index d1a6624af..ee85029ce 100644 --- a/tests/ImageSharp.Tests/TestImages.cs +++ b/tests/ImageSharp.Tests/TestImages.cs @@ -111,6 +111,9 @@ namespace SixLabors.ImageSharp.Tests // Issue 935: https://github.com/SixLabors/ImageSharp/issues/935 public const string Issue935 = "Png/issues/Issue_935.png"; + // Issue 1765: https://github.com/SixLabors/ImageSharp/issues/1765 + public const string Issue1765 = "png/issues/Issue_1765.png"; + public static class Bad { public const string MissingDataChunk = "Png/xdtn0g01.png"; diff --git a/tests/Images/Input/Png/issues/Issue_1765.png b/tests/Images/Input/Png/issues/Issue_1765.png new file mode 100644 index 000000000..c9705550f --- /dev/null +++ b/tests/Images/Input/Png/issues/Issue_1765.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:86ea14567bcd259d76dc782ee366c23a5755714c6d48f636524b23e75b89e5b6 +size 775275 From 7b7ee4a9fb16e27b556061d87e38e64d5f583758 Mon Sep 17 00:00:00 2001 From: Gerard Gunnewijk Date: Wed, 22 Sep 2021 18:27:41 +0200 Subject: [PATCH 44/56] Reverted sln version change --- ImageSharp.sln | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ImageSharp.sln b/ImageSharp.sln index b6f3b5a0f..6ae369f2d 100644 --- a/ImageSharp.sln +++ b/ImageSharp.sln @@ -1,6 +1,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.0.31710.8 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.28902.138 MinimumVisualStudioVersion = 10.0.40219.1 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "_root", "_root", "{C317F1B1-D75E-4C6D-83EB-80367343E0D7}" ProjectSection(SolutionItems) = preProject From c967e3653ad1b1f9dd1a54a713e7e0d6709c637d Mon Sep 17 00:00:00 2001 From: Gerard Gunnewijk Date: Wed, 22 Sep 2021 18:39:40 +0200 Subject: [PATCH 45/56] Was it a capital letter issue? 
--- tests/ImageSharp.Tests/TestImages.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs index ee85029ce..fb6cc7b67 100644 --- a/tests/ImageSharp.Tests/TestImages.cs +++ b/tests/ImageSharp.Tests/TestImages.cs @@ -112,7 +112,7 @@ namespace SixLabors.ImageSharp.Tests public const string Issue935 = "Png/issues/Issue_935.png"; // Issue 1765: https://github.com/SixLabors/ImageSharp/issues/1765 - public const string Issue1765 = "png/issues/Issue_1765.png"; + public const string Issue1765 = "Png/issues/Issue_1765.png"; public static class Bad { From b055e8b14bb9e75d8093fb99b6b2cba24b873495 Mon Sep 17 00:00:00 2001 From: Gerard Gunnewijk Date: Thu, 23 Sep 2021 17:02:55 +0200 Subject: [PATCH 46/56] Renamed the file and file reference --- ImageSharp.sln | 2 +- tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs | 2 +- tests/ImageSharp.Tests/TestImages.cs | 2 +- .../{Issue_1765.png => Issue_1765_Net6DeflateStreamRead.png} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename tests/Images/Input/Png/issues/{Issue_1765.png => Issue_1765_Net6DeflateStreamRead.png} (100%) diff --git a/ImageSharp.sln b/ImageSharp.sln index 6ae369f2d..c433d22f5 100644 --- a/ImageSharp.sln +++ b/ImageSharp.sln @@ -403,7 +403,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "issues", "issues", "{670DD4 tests\Images\Input\Png\issues\Issue_1127.png = tests\Images\Input\Png\issues\Issue_1127.png tests\Images\Input\Png\issues\Issue_1177_1.png = tests\Images\Input\Png\issues\Issue_1177_1.png tests\Images\Input\Png\issues\Issue_1177_2.png = tests\Images\Input\Png\issues\Issue_1177_2.png - tests\Images\Input\Png\issues\Issue_1765.png = tests\Images\Input\Png\issues\Issue_1765.png + tests\Images\Input\Png\issues\Issue_1765_Net6DeflateStreamRead.png = tests\Images\Input\Png\issues\Issue_1765_Net6DeflateStreamRead.png tests\Images\Input\Png\issues\Issue_410.png = 
tests\Images\Input\Png\issues\Issue_410.png tests\Images\Input\Png\issues\Issue_935.png = tests\Images\Input\Png\issues\Issue_935.png EndProjectSection diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs index a517c4a4a..9fc4d03dd 100644 --- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs @@ -370,7 +370,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Png // https://github.com/SixLabors/ImageSharp/issues/1765 [Theory] - [WithFile(TestImages.Png.Issue1765, PixelTypes.Rgba32)] + [WithFile(TestImages.Png.Issue1765_Net6DeflateStreamRead, PixelTypes.Rgba32)] public void Issue1765(TestImageProvider provider) where TPixel : unmanaged, IPixel { diff --git a/tests/ImageSharp.Tests/TestImages.cs b/tests/ImageSharp.Tests/TestImages.cs index fb6cc7b67..b0a219711 100644 --- a/tests/ImageSharp.Tests/TestImages.cs +++ b/tests/ImageSharp.Tests/TestImages.cs @@ -112,7 +112,7 @@ namespace SixLabors.ImageSharp.Tests public const string Issue935 = "Png/issues/Issue_935.png"; // Issue 1765: https://github.com/SixLabors/ImageSharp/issues/1765 - public const string Issue1765 = "Png/issues/Issue_1765.png"; + public const string Issue1765_Net6DeflateStreamRead = "Png/issues/Issue_1765_Net6DeflateStreamRead.png"; public static class Bad { diff --git a/tests/Images/Input/Png/issues/Issue_1765.png b/tests/Images/Input/Png/issues/Issue_1765_Net6DeflateStreamRead.png similarity index 100% rename from tests/Images/Input/Png/issues/Issue_1765.png rename to tests/Images/Input/Png/issues/Issue_1765_Net6DeflateStreamRead.png From 8d29205076b5bd7265e507a7d3c9a85ae5e410bc Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Thu, 23 Sep 2021 22:29:16 +0300 Subject: [PATCH 47/56] Updated encoder benchmark --- .../Codecs/Jpeg/EncodeJpeg.cs | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git 
a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs index 508b4b3b0..0e9bed1d9 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs @@ -111,24 +111,24 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg } /* -BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19042 +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042 Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores -.NET Core SDK=6.0.100-preview.3.21202.5 - [Host] : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT - DefaultJob : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + DefaultJob : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT | Method | Quality | Mean | Error | StdDev | Ratio | |---------------------------- |-------- |---------:|---------:|---------:|------:| -| 'System.Drawing Jpeg 4:2:0' | 75 | 29.41 ms | 0.108 ms | 0.096 ms | 1.00 | -| 'ImageSharp Jpeg 4:2:0' | 75 | 26.30 ms | 0.131 ms | 0.109 ms | 0.89 | -| 'ImageSharp Jpeg 4:4:4' | 75 | 36.70 ms | 0.303 ms | 0.269 ms | 1.25 | +| 'System.Drawing Jpeg 4:2:0' | 75 | 30.04 ms | 0.540 ms | 0.479 ms | 1.00 | +| 'ImageSharp Jpeg 4:2:0' | 75 | 19.32 ms | 0.290 ms | 0.257 ms | 0.64 | +| 'ImageSharp Jpeg 4:4:4' | 75 | 26.76 ms | 0.332 ms | 0.294 ms | 0.89 | | | | | | | | -| 'System.Drawing Jpeg 4:2:0' | 90 | 32.67 ms | 0.226 ms | 0.211 ms | 1.00 | -| 'ImageSharp Jpeg 4:2:0' | 90 | 33.56 ms | 0.237 ms | 0.222 ms | 1.03 | -| 'ImageSharp Jpeg 4:4:4' | 90 | 44.82 ms | 0.250 ms | 0.234 ms | 1.37 | +| 'System.Drawing Jpeg 4:2:0' | 90 | 32.82 ms | 0.184 ms | 0.163 ms | 1.00 | +| 'ImageSharp Jpeg 4:2:0' | 90 | 25.00 ms | 0.408 ms | 0.361 ms | 0.76 | +| 'ImageSharp Jpeg 4:4:4' | 90 | 31.83 ms | 0.636 ms | 
0.595 ms | 0.97 | | | | | | | | -| 'System.Drawing Jpeg 4:2:0' | 100 | 39.06 ms | 0.233 ms | 0.218 ms | 1.00 | -| 'ImageSharp Jpeg 4:2:0' | 100 | 40.23 ms | 0.225 ms | 0.277 ms | 1.03 | -| 'ImageSharp Jpeg 4:4:4' | 100 | 63.35 ms | 0.486 ms | 0.431 ms | 1.62 | +| 'System.Drawing Jpeg 4:2:0' | 100 | 39.30 ms | 0.359 ms | 0.318 ms | 1.00 | +| 'ImageSharp Jpeg 4:2:0' | 100 | 34.49 ms | 0.265 ms | 0.235 ms | 0.88 | +| 'ImageSharp Jpeg 4:4:4' | 100 | 56.40 ms | 0.565 ms | 0.501 ms | 1.44 | */ From 6532552b6b8041a7b33f0392476014da29da1208 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 28 Sep 2021 18:50:50 +0300 Subject: [PATCH 48/56] Naming fix & simd else if branch --- .../Jpeg/Components/Block8x8F.Intrinsic.cs | 2 +- .../Formats/Jpeg/Components/Block8x8F.cs | 10 ++++---- .../FastFloatingPointDCT.Intrinsic.cs | 24 +++++++++---------- .../Jpeg/Components/FastFloatingPointDCT.cs | 4 ++-- .../BlockOperations/Block8x8F_Transpose.cs | 4 ++-- .../Formats/Jpg/Block8x8FTests.cs | 4 ++-- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index e78802472..5a00ccd3d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -77,7 +77,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } } - private void TransposeAvx() + private void Transpose_Avx() { // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 Vector256 r0 = Avx.InsertVector128( diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 986af3417..1d2b19a7b 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -612,25 +612,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Transpose 
the block inplace. /// [MethodImpl(InliningOptions.ShortMethod)] - public void Transpose() + public void TransposeInplace() { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) { - this.TransposeAvx(); + this.Transpose_Avx(); } else #endif { - this.TransposeScalar(); + this.TransposeInplace_Scalar(); } } /// - /// Scalar inplace transpose implementation for + /// Scalar inplace transpose implementation for /// [MethodImpl(InliningOptions.ShortMethod)] - private void TransposeScalar() + private void TransposeInplace_Scalar() { float tmp; int horIndex, verIndex; diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index 7a2b0a78c..0ebe9dbf9 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -45,33 +45,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation."); // First pass - process rows - block.Transpose(); + block.TransposeInplace(); if (Avx.IsSupported) { - FDCT8x8_avx(ref block); + FDCT8x8_Avx(ref block); } - else if (Sse.IsSupported) + else { // Left part - FDCT8x4_sse(ref Unsafe.As>(ref block.V0L)); + FDCT8x4_Sse(ref Unsafe.As>(ref block.V0L)); // Right part - FDCT8x4_sse(ref Unsafe.As>(ref block.V0R)); + FDCT8x4_Sse(ref Unsafe.As>(ref block.V0R)); } // Second pass - process columns - block.Transpose(); + block.TransposeInplace(); if (Avx.IsSupported) { - FDCT8x8_avx(ref block); + FDCT8x8_Avx(ref block); } - else if (Sse.IsSupported) + else { // Left part - FDCT8x4_sse(ref Unsafe.As>(ref block.V0L)); + FDCT8x4_Sse(ref Unsafe.As>(ref block.V0L)); // Right part - FDCT8x4_sse(ref Unsafe.As>(ref block.V0R)); + FDCT8x4_Sse(ref Unsafe.As>(ref block.V0R)); } } @@ -83,7 +83,7 @@ namespace 
SixLabors.ImageSharp.Formats.Jpeg.Components /// Must be called on both 8x4 matrix parts for the full FDCT transform. /// /// Input reference to the first - public static void FDCT8x4_sse(ref Vector128 blockRef) + public static void FDCT8x4_Sse(ref Vector128 blockRef) { DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation."); @@ -135,7 +135,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Requires Avx support. /// /// Input matrix. - public static void FDCT8x8_avx(ref Block8x8F block) + public static void FDCT8x8_Avx(ref Block8x8F block) { DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 4f7db7c59..51f29fd51 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -88,9 +88,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Matrix to store temporal results. 
public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) { - block.Transpose(); + block.TransposeInplace(); IDCT8x8(ref block, ref temp); - temp.Transpose(); + temp.TransposeInplace(); IDCT8x8(ref temp, ref block); // TODO: This can be fused into quantization table step diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs index 28899b51e..f60121d33 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -12,9 +12,9 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations private Block8x8F source = Create8x8FloatData(); [Benchmark] - public float TransposeInto() + public float TransposeInplace() { - this.source.Transpose(); + this.source.TransposeInplace(); return this.source[0]; } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index 40e42acb3..d01b4b501 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -166,7 +166,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg } [Fact] - public void Transpose() + public void TransposeInplace() { static void RunTest() { @@ -176,7 +176,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg var block8x8 = default(Block8x8F); block8x8.LoadFrom(Create8x8FloatData()); - block8x8.Transpose(); + block8x8.TransposeInplace(); float[] actual = new float[64]; block8x8.ScaledCopyTo(actual); From 7831caab950e21d093b7eac8349ea6fd92d8ae2d Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 28 Sep 2021 18:59:51 +0300 Subject: [PATCH 49/56] DCT fixes, ifdef & accessor --- .../Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs | 2 -- src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs | 2 +- 2 files 
changed, 1 insertion(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index 0ebe9dbf9..7d92c3468 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -188,7 +188,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Destination public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) { -#if SUPPORTS_RUNTIME_INTRINSICS Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); Vector256 my1 = s.V1; @@ -236,7 +235,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components d.V5 = Avx.Subtract(my2, mb2); d.V3 = Avx.Add(my3, mb3); d.V4 = Avx.Subtract(my3, mb3); -#endif } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 51f29fd51..985dac1bd 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -230,7 +230,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Source /// Destination - public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) + private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) From dce87fe2f8ffbd37ddf993e22aa83fc4dbefe69b Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 28 Sep 2021 19:02:07 +0300 Subject: [PATCH 50/56] Naming fix --- src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs | 2 +- src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs index 5a00ccd3d..0971ccdca 100644 --- 
a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -77,7 +77,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } } - private void Transpose_Avx() + private void TransposeInplace_Avx() { // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 Vector256 r0 = Avx.InsertVector128( diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 1d2b19a7b..0bd20b441 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -617,7 +617,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) { - this.Transpose_Avx(); + this.TransposeInplace_Avx(); } else #endif From e4b32dbf28cabb982b14225db773ccf4110dec69 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 28 Sep 2021 21:54:08 +0300 Subject: [PATCH 51/56] Improved scalar transpose implementation --- .../Formats/Jpeg/Components/Block8x8F.cs | 63 ++++++++++++++----- .../BlockOperations/Block8x8F_Transpose.cs | 21 ++++--- 2 files changed, 60 insertions(+), 24 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 0bd20b441..02f5a1324 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -632,22 +632,55 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components [MethodImpl(InliningOptions.ShortMethod)] private void TransposeInplace_Scalar() { - float tmp; - int horIndex, verIndex; - - // We don't care about the last row as it consists of a single element - // Which won't be swapped with anything - for (int i = 0; i < 7; i++) + ref float elemRef = ref Unsafe.As(ref this); + + // row #0 + Swap(ref Unsafe.Add(ref elemRef, 1), ref Unsafe.Add(ref 
elemRef, 8)); + Swap(ref Unsafe.Add(ref elemRef, 2), ref Unsafe.Add(ref elemRef, 16)); + Swap(ref Unsafe.Add(ref elemRef, 3), ref Unsafe.Add(ref elemRef, 24)); + Swap(ref Unsafe.Add(ref elemRef, 4), ref Unsafe.Add(ref elemRef, 32)); + Swap(ref Unsafe.Add(ref elemRef, 5), ref Unsafe.Add(ref elemRef, 40)); + Swap(ref Unsafe.Add(ref elemRef, 6), ref Unsafe.Add(ref elemRef, 48)); + Swap(ref Unsafe.Add(ref elemRef, 7), ref Unsafe.Add(ref elemRef, 56)); + + // row #1 + Swap(ref Unsafe.Add(ref elemRef, 10), ref Unsafe.Add(ref elemRef, 17)); + Swap(ref Unsafe.Add(ref elemRef, 11), ref Unsafe.Add(ref elemRef, 25)); + Swap(ref Unsafe.Add(ref elemRef, 12), ref Unsafe.Add(ref elemRef, 33)); + Swap(ref Unsafe.Add(ref elemRef, 13), ref Unsafe.Add(ref elemRef, 41)); + Swap(ref Unsafe.Add(ref elemRef, 14), ref Unsafe.Add(ref elemRef, 49)); + Swap(ref Unsafe.Add(ref elemRef, 15), ref Unsafe.Add(ref elemRef, 57)); + + // row #2 + Swap(ref Unsafe.Add(ref elemRef, 19), ref Unsafe.Add(ref elemRef, 26)); + Swap(ref Unsafe.Add(ref elemRef, 20), ref Unsafe.Add(ref elemRef, 34)); + Swap(ref Unsafe.Add(ref elemRef, 21), ref Unsafe.Add(ref elemRef, 42)); + Swap(ref Unsafe.Add(ref elemRef, 22), ref Unsafe.Add(ref elemRef, 50)); + Swap(ref Unsafe.Add(ref elemRef, 23), ref Unsafe.Add(ref elemRef, 58)); + + // row #3 + Swap(ref Unsafe.Add(ref elemRef, 28), ref Unsafe.Add(ref elemRef, 35)); + Swap(ref Unsafe.Add(ref elemRef, 29), ref Unsafe.Add(ref elemRef, 43)); + Swap(ref Unsafe.Add(ref elemRef, 30), ref Unsafe.Add(ref elemRef, 51)); + Swap(ref Unsafe.Add(ref elemRef, 31), ref Unsafe.Add(ref elemRef, 59)); + + // row #4 + Swap(ref Unsafe.Add(ref elemRef, 37), ref Unsafe.Add(ref elemRef, 44)); + Swap(ref Unsafe.Add(ref elemRef, 38), ref Unsafe.Add(ref elemRef, 52)); + Swap(ref Unsafe.Add(ref elemRef, 39), ref Unsafe.Add(ref elemRef, 60)); + + // row #5 + Swap(ref Unsafe.Add(ref elemRef, 46), ref Unsafe.Add(ref elemRef, 53)); + Swap(ref Unsafe.Add(ref elemRef, 47), ref Unsafe.Add(ref elemRef, 
61)); + + // row #6 + Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62)); + + static void Swap(ref float a, ref float b) { - // We don't care about the first element in each row as it's not swapped - for (int j = i + 1; j < 8; j++) - { - horIndex = (i * 8) + j; - verIndex = (j * 8) + i; - tmp = this[horIndex]; - this[horIndex] = this[verIndex]; - this[verIndex] = tmp; - } + float tmp = a; + a = b; + b = tmp; } } diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs index f60121d33..c2efb517a 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -35,15 +35,18 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations } /* -BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update) +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1237 (20H2/October2020Update) Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores .NET SDK=6.0.100-preview.3.21202.5 - [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT - AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT - No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT - -| Method | Job | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated | -|-------------- |---------------- |----------:|----------:|----------:|------:|------:|------:|------:|----------:| -| TransposeInto | No HwIntrinsics | 19.658 ns | 0.0550 ns | 0.0515 ns | 1.00 | - | - | - | - | -| TransposeInto | AVX | 8.613 ns | 0.0249 ns | 0.0208 ns | 0.44 | - | - | - | - | + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 1. 
No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 2. SSE : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 3. AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + +Runtime=.NET Core 3.1 + +| Method | Job | Mean | Error | StdDev | Ratio | +|----------------- |----------------:|----------:|----------:|----------:|------:| +| TransposeInplace | No HwIntrinsics | 12.531 ns | 0.0637 ns | 0.0565 ns | 1.00 | +| TransposeInplace | AVX | 5.767 ns | 0.0529 ns | 0.0495 ns | 0.46 | */ From bd9f06f42be1d11df0b5080b04e52e577935aa26 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 28 Sep 2021 23:20:03 +0300 Subject: [PATCH 52/56] FDCT sse path via Vector4 --- .../FastFloatingPointDCT.Intrinsic.cs | 88 +---------- .../Jpeg/Components/FastFloatingPointDCT.cs | 142 ++++++++++++++---- 2 files changed, 114 insertions(+), 116 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index 7d92c3468..f40ae6e87 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -18,11 +18,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); - private static readonly Vector128 mm128_F_0_7071 = Vector128.Create(0.707106781f); - private static readonly Vector128 mm128_F_0_3826 = Vector128.Create(0.382683433f); - private static readonly Vector128 mm128_F_0_5411 = Vector128.Create(0.541196100f); - private static readonly Vector128 mm128_F_1_3065 = Vector128.Create(1.306562965f); - private static readonly Vector256 mm256_F_1_1758 = Vector256.Create(1.175876f); private static readonly Vector256 
mm256_F_n1_9615 = Vector256.Create(-1.961570560f); private static readonly Vector256 mm256_F_n0_3901 = Vector256.Create(-0.390180644f); @@ -40,92 +35,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Apply floating point FDCT inplace using simd operations. /// /// Input matrix. - private static void ForwardTransformSimd(ref Block8x8F block) + private static void ForwardTransform_Avx(ref Block8x8F block) { - DebugGuard.IsTrue(Avx.IsSupported || Sse.IsSupported, "Avx or at least Sse support is required to execute this operation."); + DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); // First pass - process rows block.TransposeInplace(); - if (Avx.IsSupported) - { - FDCT8x8_Avx(ref block); - } - else - { - // Left part - FDCT8x4_Sse(ref Unsafe.As>(ref block.V0L)); - - // Right part - FDCT8x4_Sse(ref Unsafe.As>(ref block.V0R)); - } + FDCT8x8_Avx(ref block); // Second pass - process columns block.TransposeInplace(); - if (Avx.IsSupported) - { - FDCT8x8_Avx(ref block); - } - else - { - // Left part - FDCT8x4_Sse(ref Unsafe.As>(ref block.V0L)); - - // Right part - FDCT8x4_Sse(ref Unsafe.As>(ref block.V0R)); - } - } - - /// - /// Apply 1D floating point FDCT inplace using SSE operations on 8x4 part of 8x8 matrix. - /// - /// - /// Requires Sse support. - /// Must be called on both 8x4 matrix parts for the full FDCT transform. 
- /// - /// Input reference to the first - public static void FDCT8x4_Sse(ref Vector128 blockRef) - { - DebugGuard.IsTrue(Sse.IsSupported, "Sse support is required to execute this operation."); - - Vector128 tmp0 = Sse.Add(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14)); - Vector128 tmp7 = Sse.Subtract(Unsafe.Add(ref blockRef, 0), Unsafe.Add(ref blockRef, 14)); - Vector128 tmp1 = Sse.Add(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12)); - Vector128 tmp6 = Sse.Subtract(Unsafe.Add(ref blockRef, 2), Unsafe.Add(ref blockRef, 12)); - Vector128 tmp2 = Sse.Add(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10)); - Vector128 tmp5 = Sse.Subtract(Unsafe.Add(ref blockRef, 4), Unsafe.Add(ref blockRef, 10)); - Vector128 tmp3 = Sse.Add(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8)); - Vector128 tmp4 = Sse.Subtract(Unsafe.Add(ref blockRef, 6), Unsafe.Add(ref blockRef, 8)); - - // Even part - Vector128 tmp10 = Sse.Add(tmp0, tmp3); - Vector128 tmp13 = Sse.Subtract(tmp0, tmp3); - Vector128 tmp11 = Sse.Add(tmp1, tmp2); - Vector128 tmp12 = Sse.Subtract(tmp1, tmp2); - - Unsafe.Add(ref blockRef, 0) = Sse.Add(tmp10, tmp11); - Unsafe.Add(ref blockRef, 8) = Sse.Subtract(tmp10, tmp11); - - Vector128 z1 = Sse.Multiply(Sse.Add(tmp12, tmp13), mm128_F_0_7071); - Unsafe.Add(ref blockRef, 4) = Sse.Add(tmp13, z1); - Unsafe.Add(ref blockRef, 12) = Sse.Subtract(tmp13, z1); - - // Odd part - tmp10 = Sse.Add(tmp4, tmp5); - tmp11 = Sse.Add(tmp5, tmp6); - tmp12 = Sse.Add(tmp6, tmp7); - - Vector128 z5 = Sse.Multiply(Sse.Subtract(tmp10, tmp12), mm128_F_0_3826); - Vector128 z2 = Sse.Add(Sse.Multiply(mm128_F_0_5411, tmp10), z5); - Vector128 z4 = Sse.Add(Sse.Multiply(mm128_F_1_3065, tmp12), z5); - Vector128 z3 = Sse.Multiply(tmp11, mm128_F_0_7071); - - Vector128 z11 = Sse.Add(tmp7, z3); - Vector128 z13 = Sse.Subtract(tmp7, z3); - - Unsafe.Add(ref blockRef, 10) = Sse.Add(z13, z2); - Unsafe.Add(ref blockRef, 6) = Sse.Subtract(z13, z2); - Unsafe.Add(ref blockRef, 2) = 
Sse.Add(z11, z4); - Unsafe.Add(ref blockRef, 14) = Sse.Subtract(z11, z4); + FDCT8x8_Avx(ref block); } /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 985dac1bd..43f6b7a1f 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -18,30 +18,27 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { #pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore private const float C_1_175876 = 1.175875602f; - private const float C_1_961571 = -1.961570560f; - private const float C_0_390181 = -0.390180644f; - private const float C_0_899976 = -0.899976223f; - private const float C_2_562915 = -2.562915447f; - private const float C_0_298631 = 0.298631336f; - private const float C_2_053120 = 2.053119869f; - private const float C_3_072711 = 3.072711026f; - private const float C_1_501321 = 1.501321110f; - private const float C_0_541196 = 0.541196100f; - private const float C_1_847759 = -1.847759065f; - private const float C_0_765367 = 0.765366865f; private const float C_0_125 = 0.1250f; + +#pragma warning disable SA1311, IDE1006 // naming rules violation warnings + private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f); + private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f); + private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f); + private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f); +#pragma warning restore SA1311, IDE1006 + #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore /// @@ -80,23 +77,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, }; - /// - /// Apply floating point IDCT inplace. 
- /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. - /// - /// Input matrix. - /// Matrix to store temporal results. - public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) - { - block.TransposeInplace(); - IDCT8x8(ref block, ref temp); - temp.TransposeInplace(); - IDCT8x8(ref temp, ref block); - - // TODO: This can be fused into quantization table step - block.MultiplyInPlace(C_0_125); - } - /// /// Apply 2D floating point FDCT inplace. /// @@ -104,14 +84,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components public static void TransformFDCT(ref Block8x8F block) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse.IsSupported) + if (Avx.IsSupported) { - ForwardTransformSimd(ref block); + ForwardTransform_Avx(ref block); } else #endif + if (Vector.IsHardwareAccelerated) { - ForwardTransformScalar(ref block); + ForwardTransform_Vector4(ref block); + } + else + { + ForwardTransform_Scalar(ref block); } } @@ -122,7 +107,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c. /// /// Input matrix. - private static void ForwardTransformScalar(ref Block8x8F block) + private static void ForwardTransform_Scalar(ref Block8x8F block) { const int dctSize = 8; @@ -225,6 +210,99 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components } } + /// + /// Apply floating point FDCT inplace using API. + /// + /// + /// This implementation must be called only if hardware supports 4 + /// floating point numbers vector. Otherwise explicit scalar + /// implementation is faster + /// because it does not rely on matrix transposition. + /// + /// Input matrix. 
+ private static void ForwardTransform_Vector4(ref Block8x8F block) + { + DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); + + // First pass - process rows + block.TransposeInplace(); + FDCT8x4_Vector4(ref block.V0L); + FDCT8x4_Vector4(ref block.V0R); + + // Second pass - process columns + block.TransposeInplace(); + FDCT8x4_Vector4(ref block.V0L); + FDCT8x4_Vector4(ref block.V0R); + } + + /// + /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix. + /// + /// + /// Implemented using Vector4 API operations for either scalar or sse hardware implementation. + /// Must be called on both 8x4 matrix parts for the full FDCT transform. + /// + /// Input reference to the first + private static void FDCT8x4_Vector4(ref Vector4 blockRef) + { + Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14); + Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14); + Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12); + Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12); + Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10); + Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10); + Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8); + Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8); + + // Even part + Vector4 tmp10 = tmp0 + tmp3; + Vector4 tmp13 = tmp0 - tmp3; + Vector4 tmp11 = tmp1 + tmp2; + Vector4 tmp12 = tmp1 - tmp2; + + Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11; + Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11; + + Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071; + Unsafe.Add(ref blockRef, 4) = tmp13 + z1; + Unsafe.Add(ref blockRef, 12) = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826; + Vector4 z2 = (mm128_F_0_5411 * tmp10) + 
z5; + Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5; + Vector4 z3 = tmp11 * mm128_F_0_7071; + + Vector4 z11 = tmp7 + z3; + Vector4 z13 = tmp7 - z3; + + Unsafe.Add(ref blockRef, 10) = z13 + z2; + Unsafe.Add(ref blockRef, 6) = z13 - z2; + Unsafe.Add(ref blockRef, 2) = z11 + z4; + Unsafe.Add(ref blockRef, 14) = z11 - z4; + } + + /// + /// Apply floating point IDCT inplace. + /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. + /// + /// Input matrix. + /// Matrix to store temporal results. + public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) + { + block.TransposeInplace(); + IDCT8x8(ref block, ref temp); + temp.TransposeInplace(); + IDCT8x8(ref temp, ref block); + + // TODO: This can be fused into quantization table step + block.MultiplyInPlace(C_0_125); + } + /// /// Performs 8x8 matrix Inverse Discrete Cosine Transform /// From e9eaa5222e63ca9b11c6eaeb283060b714f2becf Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Tue, 28 Sep 2021 23:29:57 +0300 Subject: [PATCH 53/56] FDCT fma usage --- .../Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs index f40ae6e87..ab9462632 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -87,8 +87,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components tmp12 = Avx.Add(tmp6, tmp7); Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826); - Vector256 z2 = Avx.Add(Avx.Multiply(mm256_F_0_5411, tmp10), z5); - Vector256 z4 = Avx.Add(Avx.Multiply(mm256_F_1_3065, tmp12), z5); + Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10); + Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, 
mm256_F_1_3065, tmp12); Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); Vector256 z11 = Avx.Add(tmp7, z3); From 4ff29844febdc5e59c0fbd33461741f197d293cf Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 1 Oct 2021 22:35:36 +0300 Subject: [PATCH 54/56] Docs --- .../Components/Encoder/HuffmanScanEncoder.cs | 120 +++++++++++++----- 1 file changed, 90 insertions(+), 30 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 35e0e2648..bbdd3220f 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -65,6 +65,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// Yields codewords by index consisting of [run length | bitsize]. private HuffmanLut[] huffmanTables; + /// + /// Emitted bits 'micro buffer' before being transferred to the . + /// + private uint accumulatedBits; + /// /// Buffer for temporal storage of huffman rle encoding bit data. /// @@ -82,18 +87,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// private readonly byte[] streamWriteBuffer; - private int emitWriteIndex; - - /// - /// Emitted bits 'micro buffer' before being transferred to the . - /// - private uint accumulatedBits; - /// /// Number of jagged bits stored in /// private int bitCount; + private int emitWriteIndex; + private Block8x8 tempBlock; /// @@ -101,9 +101,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// private readonly Stream target; - public HuffmanScanEncoder(int componentCount, Stream outputStream) + /// + /// Initializes a new instance of the class. + /// + /// Amount of encoded 8x8 blocks per single jpeg macroblock. + /// Output stream for saving encoded data. 
+ public HuffmanScanEncoder(int blocksPerCodingUnit, Stream outputStream) { - int emitBufferByteLength = MaxBytesPerBlock * componentCount; + int emitBufferByteLength = MaxBytesPerBlock * blocksPerCodingUnit; this.emitBuffer = new uint[emitBufferByteLength / sizeof(uint)]; this.emitWriteIndex = this.emitBuffer.Length; @@ -112,7 +117,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder this.target = outputStream; } - private bool IsFlushNeeded + /// + /// Gets a value indicating whether is full + /// and must be flushed using + /// before encoding next 8x8 coding block. + /// + private bool IsStreamFlushNeeded { [MethodImpl(MethodImplOptions.AggressiveInlining)] get => this.emitWriteIndex < (uint)this.emitBuffer.Length / 2; @@ -174,7 +184,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref pixelConverter.Cr, ref chrominanceQuantTable); - if (this.IsFlushNeeded) + if (this.IsStreamFlushNeeded) { this.FlushToStream(); } @@ -249,7 +259,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref pixelConverter.Cr, ref chrominanceQuantTable); - if (this.IsFlushNeeded) + if (this.IsStreamFlushNeeded) { this.FlushToStream(); } @@ -300,7 +310,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref pixelConverter.Y, ref luminanceQuantTable); - if (this.IsFlushNeeded) + if (this.IsStreamFlushNeeded) { this.FlushToStream(); } @@ -364,7 +374,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder ref pixelConverter.B, ref luminanceQuantTable); - if (this.IsFlushNeeded) + if (this.IsStreamFlushNeeded) { this.FlushToStream(); } @@ -447,15 +457,48 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } /// - /// Emits the least significant count of bits to the stream write buffer. - /// The precondition is bits - /// - /// < 1<<nBits && nBits <= 16 - /// - /// . + /// Emits the most significant count of bits to the buffer. /// - /// The packed bits. 
- /// The number of bits + /// + /// + /// Supports up to 32 count of bits but, generally speaking, jpeg + /// standard assures that there won't be more than 16 bits per single + /// value. + /// + /// + /// Emitting algorithm uses 3 intermediate buffers for caching before + /// writing to the stream: + /// + /// + /// uint32 + /// + /// Bit buffer. Encoded spectral values can occupy up to 16 bits, bits + /// are assembled to whole bytes via this intermediate buffer. + /// + /// + /// + /// uint32[] + /// + /// Assembled bytes from uint32 buffer are saved into this buffer. + /// uint32 buffer values are saved using indices from the last to the first. + /// As bytes are saved to the memory as 4-byte packages endianness matters: + /// Jpeg stream is big-endian, indexing buffer bytes from the last index to the + /// first eliminates all operations to extract separate bytes. This only works for + /// little-endian machines (there are no known examples of big-endian users atm). + /// For big-endians this approach is slower due to the separate byte extraction. + /// + /// + /// + /// byte[] + /// + /// Byte buffer used only during method. + /// + /// + /// + /// + /// + /// Bits to emit, must be shifted to the left. + /// Bits count stored in the bits parameter. [MethodImpl(InliningOptions.ShortMethod)] private void Emit(uint bits, int count) { @@ -475,10 +518,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } /// - /// Emits the given value with the given Huffman encoder. + /// Emits the given value with the given Huffman table. /// - /// Compiled Huffman spec values. - /// The value to encode. + /// Huffman table. + /// Value to encode. [MethodImpl(InliningOptions.ShortMethod)] private void EmitHuff(int[] table, int value) { @@ -489,9 +532,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// Emits given value via huffman rle encoding. /// - /// Compiled Huffman spec values. + /// Huffman table. 
/// The number of preceding zeroes, preshifted by 4 to the left. - /// The value to encode. + /// Value to encode. [MethodImpl(InliningOptions.ShortMethod)] private void EmitHuffRLE(int[] table, int runLength, int value) { @@ -555,11 +598,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } /// - /// Flushes cached bytes to the ouput stream respecting stuff bytes. + /// General method for flushing cached spectral data bytes to + /// the output stream respecting stuff bytes. /// /// - /// Bytes cached via are stored in 4-bytes blocks which makes - /// this method endianness dependent. + /// Bytes cached via Emit(uint, int) are stored in 4-bytes blocks + /// which makes this method endianness dependent. /// [MethodImpl(InliningOptions.ShortMethod)] private void FlushToStream(int endIndex) @@ -623,12 +667,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder this.target.Write(this.streamWriteBuffer, 0, writeIdx); } + /// + /// Flushes spectral data bytes after encoding all channel blocks + /// in a single jpeg macroblock using FlushToStream(int). + /// + /// + /// This must be called only if IsStreamFlushNeeded is true + /// only during the macroblocks encoding routine. + /// private void FlushToStream() { this.FlushToStream(this.emitWriteIndex * 4); this.emitWriteIndex = this.emitBuffer.Length; } + /// + /// Flushes final cached bits to the stream padding 1's to + /// complement full bytes. + /// + /// + /// This must be called only once at the end of the encoding routine. + /// IsStreamFlushNeeded check is not needed. 
+ /// [MethodImpl(InliningOptions.ShortMethod)] private void FlushRemainingBytes() { From aae451c84408afccc499c1d4e2af4de289c726c2 Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 1 Oct 2021 22:46:45 +0300 Subject: [PATCH 55/56] Quant table adjustment method --- .../Components/Encoder/HuffmanScanEncoder.cs | 38 ++++++------------- .../Jpeg/Components/FastFloatingPointDCT.cs | 19 +++++++++- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index bbdd3220f..b3cdbf0a0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -139,12 +139,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void Encode444(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < Block8x8F.Size; i++) - { - luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; - chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i]; - } + FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable); + FastFloatingPointDCT.AdjustToFDCT(ref chrominanceQuantTable); this.huffmanTables = HuffmanLut.TheHuffmanLut; @@ -206,12 +202,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void Encode420(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < Block8x8F.Size; i++) - { - luminanceQuantTable[i] = 
FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; - chrominanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / chrominanceQuantTable[i]; - } + FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable); + FastFloatingPointDCT.AdjustToFDCT(ref chrominanceQuantTable); this.huffmanTables = HuffmanLut.TheHuffmanLut; @@ -279,11 +271,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void EncodeGrayscale(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < Block8x8F.Size; i++) - { - luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; - } + FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable); this.huffmanTables = HuffmanLut.TheHuffmanLut; @@ -325,16 +313,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// /// The pixel format. /// The pixel accessor providing access to the image pixels. - /// Luminance quantization table provided by the callee. + /// Quantization table provided by the caller. /// The token to monitor for cancellation. 
- public void EncodeRgb(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken) + public void EncodeRgb(Image pixels, ref Block8x8F quantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - // Calculate reciprocal quantization tables for FDCT method - for (int i = 0; i < Block8x8F.Size; i++) - { - luminanceQuantTable[i] = FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i] / luminanceQuantTable[i]; - } + FastFloatingPointDCT.AdjustToFDCT(ref quantTable); this.huffmanTables = HuffmanLut.TheHuffmanLut; @@ -360,19 +344,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder QuantIndex.Luminance, prevDCR, ref pixelConverter.R, - ref luminanceQuantTable); + ref quantTable); prevDCG = this.WriteBlock( QuantIndex.Luminance, prevDCG, ref pixelConverter.G, - ref luminanceQuantTable); + ref quantTable); prevDCB = this.WriteBlock( QuantIndex.Luminance, prevDCB, ref pixelConverter.B, - ref luminanceQuantTable); + ref quantTable); if (this.IsStreamFlushNeeded) { diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 43f6b7a1f..dc88255c5 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -62,10 +62,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 /// - /// Values are also scaled by 8 so DCT code won't do unnecessary division. + /// Values are also scaled by 8 so DCT code won't do extra division/multiplication. 
/// /// - public static readonly float[] DctReciprocalAdjustmentCoefficients = new float[] + private static readonly float[] DctReciprocalAdjustmentCoefficients = new float[] { 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, @@ -77,6 +77,21 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, }; + /// + /// Adjusts given quantization table to be compliant with FDCT implementation. + /// + /// + /// See DctReciprocalAdjustmentCoefficients docs for explanation. + /// + /// Quantization table to adjust. + public static void AdjustToFDCT(ref Block8x8F quantizationtable) + { + for (int i = 0; i < Block8x8F.Size; i++) + { + quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i]; + } + } + /// /// Apply 2D floating point FDCT inplace. /// From 2dfbff5a90b4ec118829ba345ee5b97e4311f76b Mon Sep 17 00:00:00 2001 From: Dmitry Pentin Date: Fri, 1 Oct 2021 22:56:40 +0300 Subject: [PATCH 56/56] Access modifier fix --- src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index dc88255c5..6963c3636 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. -using System; using System.Numerics; using System.Runtime.CompilerServices; #if SUPPORTS_RUNTIME_INTRINSICS @@ -65,7 +64,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// Values are also scaled by 8 so DCT code won't do extra division/multiplication. 
/// /// - private static readonly float[] DctReciprocalAdjustmentCoefficients = new float[] + internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[] { 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,