diff --git a/shared-infrastructure b/shared-infrastructure
index 33cb12ca77..a042aba176 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit 33cb12ca77f919b44de56f344d2627cc2a108c3a
+Subproject commit a042aba176cdb840d800c6ed4cfe41a54fb7b1e3
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index 9d49b8c45f..27bb2fc3cd 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -337,6 +337,64 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
}
}
+ /// <summary>
+ /// Transposes the block in place: every element above the main diagonal is swapped with its mirror element below it (linear index = row * 8 + column).
+ /// </summary>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void TransposeInplace()
+ {
+ ref short elemRef = ref Unsafe.As(ref this);
+
+ // row #0: swap (0, 1..7) with (1..7, 0)
+ Swap(ref Unsafe.Add(ref elemRef, 1), ref Unsafe.Add(ref elemRef, 8));
+ Swap(ref Unsafe.Add(ref elemRef, 2), ref Unsafe.Add(ref elemRef, 16));
+ Swap(ref Unsafe.Add(ref elemRef, 3), ref Unsafe.Add(ref elemRef, 24));
+ Swap(ref Unsafe.Add(ref elemRef, 4), ref Unsafe.Add(ref elemRef, 32));
+ Swap(ref Unsafe.Add(ref elemRef, 5), ref Unsafe.Add(ref elemRef, 40));
+ Swap(ref Unsafe.Add(ref elemRef, 6), ref Unsafe.Add(ref elemRef, 48));
+ Swap(ref Unsafe.Add(ref elemRef, 7), ref Unsafe.Add(ref elemRef, 56));
+
+ // row #1: swap (1, 2..7) with (2..7, 1)
+ Swap(ref Unsafe.Add(ref elemRef, 10), ref Unsafe.Add(ref elemRef, 17));
+ Swap(ref Unsafe.Add(ref elemRef, 11), ref Unsafe.Add(ref elemRef, 25));
+ Swap(ref Unsafe.Add(ref elemRef, 12), ref Unsafe.Add(ref elemRef, 33));
+ Swap(ref Unsafe.Add(ref elemRef, 13), ref Unsafe.Add(ref elemRef, 41));
+ Swap(ref Unsafe.Add(ref elemRef, 14), ref Unsafe.Add(ref elemRef, 49));
+ Swap(ref Unsafe.Add(ref elemRef, 15), ref Unsafe.Add(ref elemRef, 57));
+
+ // row #2: swap (2, 3..7) with (3..7, 2)
+ Swap(ref Unsafe.Add(ref elemRef, 19), ref Unsafe.Add(ref elemRef, 26));
+ Swap(ref Unsafe.Add(ref elemRef, 20), ref Unsafe.Add(ref elemRef, 34));
+ Swap(ref Unsafe.Add(ref elemRef, 21), ref Unsafe.Add(ref elemRef, 42));
+ Swap(ref Unsafe.Add(ref elemRef, 22), ref Unsafe.Add(ref elemRef, 50));
+ Swap(ref Unsafe.Add(ref elemRef, 23), ref Unsafe.Add(ref elemRef, 58));
+
+ // row #3: swap (3, 4..7) with (4..7, 3)
+ Swap(ref Unsafe.Add(ref elemRef, 28), ref Unsafe.Add(ref elemRef, 35));
+ Swap(ref Unsafe.Add(ref elemRef, 29), ref Unsafe.Add(ref elemRef, 43));
+ Swap(ref Unsafe.Add(ref elemRef, 30), ref Unsafe.Add(ref elemRef, 51));
+ Swap(ref Unsafe.Add(ref elemRef, 31), ref Unsafe.Add(ref elemRef, 59));
+
+ // row #4: swap (4, 5..7) with (5..7, 4)
+ Swap(ref Unsafe.Add(ref elemRef, 37), ref Unsafe.Add(ref elemRef, 44));
+ Swap(ref Unsafe.Add(ref elemRef, 38), ref Unsafe.Add(ref elemRef, 52));
+ Swap(ref Unsafe.Add(ref elemRef, 39), ref Unsafe.Add(ref elemRef, 60));
+
+ // row #5: swap (5, 6..7) with (6..7, 5)
+ Swap(ref Unsafe.Add(ref elemRef, 46), ref Unsafe.Add(ref elemRef, 53));
+ Swap(ref Unsafe.Add(ref elemRef, 47), ref Unsafe.Add(ref elemRef, 61));
+
+ // row #6: swap (6, 7) with (7, 6)
+ Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62));
+
+ static void Swap(ref short a, ref short b)
+ {
+ short tmp = a;
+ a = b;
+ b = tmp;
+ }
+ }
+
///
/// Calculate the total sum of absolute differences of elements in 'a' and 'b'.
///
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
index bb7b1fe78f..ce5e5110b6 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
@@ -151,6 +151,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
if (this.componentsCount == this.frame.ComponentCount)
{
this.ParseBaselineDataInterleaved();
+ this.spectralConverter.CommitConversion();
}
else
{
@@ -501,7 +502,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
{
i += r;
s = buffer.Receive(s);
- Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s;
+ Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[i++]) = (short)s;
}
else
{
@@ -570,7 +571,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
if (s != 0)
{
s = buffer.Receive(s);
- Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low);
+ Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[i]) = (short)(s << low);
}
else
{
@@ -646,7 +647,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
do
{
- ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
+ ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]);
if (coef != 0)
{
buffer.CheckBits();
@@ -672,7 +673,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
if ((s != 0) && (k < 64))
{
- Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s;
+ Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]) = (short)s;
}
}
}
@@ -681,7 +682,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
{
for (; k <= end; k++)
{
- ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
+ ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]);
if (coef != 0)
{
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
index 085cd4a291..15f212b400 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@@ -18,11 +18,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
///
public Block8x8F SourceBlock;
- ///
- /// Temporal block to store intermediate computation results.
- ///
- public Block8x8F WorkspaceBlock;
-
///
/// The quantization table as .
///
@@ -45,7 +40,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
this.subSamplingDivisors = component.SubSamplingDivisors;
this.SourceBlock = default;
- this.WorkspaceBlock = default;
}
///
@@ -71,7 +65,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
// Dequantize:
block.MultiplyInPlace(ref this.DequantiazationTable);
- FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock);
+ FastFloatingPointDCT.TransformIDCT(ref block);
// To conform better to libjpeg we actually NEED TO loose precision here.
// This is because they store blocks as Int16 between all the operations.
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
index e975b11fbb..4e74f62269 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
@@ -13,6 +13,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
///
internal abstract class SpectralConverter
{
+ /// <summary>
+ /// Gets a value indicating whether the spectral data of the current image
+ /// has already been converted. Set to true by <see cref="CommitConversion"/>.
+ /// </summary>
+ protected bool Converted { get; private set; }
+
///
/// Injects jpeg image decoding metadata.
///
@@ -33,6 +39,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
///
public abstract void ConvertStrideBaseline();
+ /// <summary>
+ /// Marks the current converter state as 'converted'.
+ /// </summary>
+ /// <remarks>
+ /// This must be called only once, and only for baseline interleaved JPEGs.
+ /// </remarks>
+ public void CommitConversion()
+ {
+ DebugGuard.IsFalse(this.Converted, nameof(this.Converted), $"{nameof(this.CommitConversion)} must be called only once");
+
+ this.Converted = true;
+ }
+
///
/// Gets the color converter.
///
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs
index 492c00c056..0c551bc36b 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs
@@ -3,6 +3,7 @@
using System;
using System.Buffers;
+using System.Linq;
using System.Numerics;
using System.Threading;
using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters;
@@ -29,8 +30,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
private Buffer2D pixelBuffer;
- private int blockRowsPerStep;
-
private int pixelRowsPerStep;
private int pixelRowCounter;
@@ -41,8 +40,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
this.cancellationToken = cancellationToken;
}
- private bool Converted => this.pixelRowCounter >= this.pixelBuffer.Height;
-
public Buffer2D GetPixelBuffer()
{
if (!this.Converted)
@@ -52,7 +49,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
for (int step = 0; step < steps; step++)
{
this.cancellationToken.ThrowIfCancellationRequested();
- this.ConvertNextStride(step);
+ this.ConvertStride(step);
}
}
@@ -65,18 +62,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
MemoryAllocator allocator = this.configuration.MemoryAllocator;
// iteration data
- IJpegComponent c0 = frame.Components[0];
+ int majorBlockWidth = frame.Components.Max((component) => component.SizeInBlocks.Width);
+ int majorVerticalSamplingFactor = frame.Components.Max((component) => component.SamplingFactors.Height);
const int blockPixelHeight = 8;
- this.blockRowsPerStep = c0.SamplingFactors.Height;
- this.pixelRowsPerStep = this.blockRowsPerStep * blockPixelHeight;
+ this.pixelRowsPerStep = majorVerticalSamplingFactor * blockPixelHeight;
// pixel buffer for resulting image
this.pixelBuffer = allocator.Allocate2D(frame.PixelWidth, frame.PixelHeight);
this.paddedProxyPixelRow = allocator.Allocate(frame.PixelWidth + 3);
// component processors from spectral to Rgba32
- var postProcessorBufferSize = new Size(c0.SizeInBlocks.Width * 8, this.pixelRowsPerStep);
+ const int blockPixelWidth = 8;
+ var postProcessorBufferSize = new Size(majorBlockWidth * blockPixelWidth, this.pixelRowsPerStep);
this.componentProcessors = new JpegComponentPostProcessor[frame.Components.Length];
for (int i = 0; i < this.componentProcessors.Length; i++)
{
@@ -84,7 +82,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
}
// single 'stride' rgba32 buffer for conversion between spectral and TPixel
- // this.rgbaBuffer = allocator.Allocate(frame.PixelWidth);
this.rgbBuffer = allocator.Allocate(frame.PixelWidth * 3);
// color converter from Rgba32 to TPixel
@@ -95,18 +92,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
public override void ConvertStrideBaseline()
{
// Convert next pixel stride using single spectral `stride'
- // Note that zero passing eliminates the need of virtual call from JpegComponentPostProcessor
- this.ConvertNextStride(spectralStep: 0);
+ // Note that zero passing eliminates the need of virtual call
+ // from JpegComponentPostProcessor
+ this.ConvertStride(spectralStep: 0);
- // Clear spectral stride - this is VERY important as jpeg possibly won't fill entire buffer each stride
- // Which leads to decoding artifacts
- // Note that this code clears all buffers of the post processors, it's their responsibility to allocate only single stride
foreach (JpegComponentPostProcessor cpp in this.componentProcessors)
{
cpp.ClearSpectralBuffers();
}
}
+ /// <inheritdoc/>
public void Dispose()
{
if (this.componentProcessors != null)
@@ -121,7 +117,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
this.paddedProxyPixelRow?.Dispose();
}
- private void ConvertNextStride(int spectralStep)
+ private void ConvertStride(int spectralStep)
{
int maxY = Math.Min(this.pixelBuffer.Height, this.pixelRowCounter + this.pixelRowsPerStep);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index ab9462632f..94864005ec 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -2,9 +2,6 @@
// Licensed under the Apache License, Version 2.0.
#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@@ -12,149 +9,147 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
internal static partial class FastFloatingPointDCT
{
-#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rule violation warnings
private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f);
private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f);
private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f);
private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f);
- private static readonly Vector256 mm256_F_1_1758 = Vector256.Create(1.175876f);
- private static readonly Vector256 mm256_F_n1_9615 = Vector256.Create(-1.961570560f);
- private static readonly Vector256 mm256_F_n0_3901 = Vector256.Create(-0.390180644f);
- private static readonly Vector256 mm256_F_n0_8999 = Vector256.Create(-0.899976223f);
- private static readonly Vector256 mm256_F_n2_5629 = Vector256.Create(-2.562915447f);
- private static readonly Vector256 mm256_F_0_2986 = Vector256.Create(0.298631336f);
- private static readonly Vector256 mm256_F_2_0531 = Vector256.Create(2.053119869f);
- private static readonly Vector256 mm256_F_3_0727 = Vector256.Create(3.072711026f);
- private static readonly Vector256 mm256_F_1_5013 = Vector256.Create(1.501321110f);
- private static readonly Vector256 mm256_F_n1_8477 = Vector256.Create(-1.847759065f);
- private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f);
+ private static readonly Vector256 mm256_F_1_4142 = Vector256.Create(1.414213562f);
+ private static readonly Vector256 mm256_F_1_8477 = Vector256.Create(1.847759065f);
+ private static readonly Vector256 mm256_F_n1_0823 = Vector256.Create(-1.082392200f);
+ private static readonly Vector256 mm256_F_n2_6131 = Vector256.Create(-2.613125930f);
#pragma warning restore SA1310, SA1311, IDE1006
///
/// Apply floating point FDCT inplace using simd operations.
///
- /// Input matrix.
- private static void ForwardTransform_Avx(ref Block8x8F block)
+ /// Input block.
+ private static void FDCT8x8_Avx(ref Block8x8F block)
{
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
// First pass - process rows
block.TransposeInplace();
- FDCT8x8_Avx(ref block);
+ FDCT8x8_1D_Avx(ref block);
// Second pass - process columns
block.TransposeInplace();
- FDCT8x8_Avx(ref block);
+ FDCT8x8_1D_Avx(ref block);
+
+ // Applies 1D floating point FDCT inplace
+ static void FDCT8x8_1D_Avx(ref Block8x8F block)
+ {
+ Vector256 tmp0 = Avx.Add(block.V0, block.V7);
+ Vector256 tmp7 = Avx.Subtract(block.V0, block.V7);
+ Vector256 tmp1 = Avx.Add(block.V1, block.V6);
+ Vector256 tmp6 = Avx.Subtract(block.V1, block.V6);
+ Vector256 tmp2 = Avx.Add(block.V2, block.V5);
+ Vector256 tmp5 = Avx.Subtract(block.V2, block.V5);
+ Vector256 tmp3 = Avx.Add(block.V3, block.V4);
+ Vector256 tmp4 = Avx.Subtract(block.V3, block.V4);
+
+ // Even part
+ Vector256 tmp10 = Avx.Add(tmp0, tmp3);
+ Vector256 tmp13 = Avx.Subtract(tmp0, tmp3);
+ Vector256 tmp11 = Avx.Add(tmp1, tmp2);
+ Vector256 tmp12 = Avx.Subtract(tmp1, tmp2);
+
+ block.V0 = Avx.Add(tmp10, tmp11);
+ block.V4 = Avx.Subtract(tmp10, tmp11);
+
+ Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
+ block.V2 = Avx.Add(tmp13, z1);
+ block.V6 = Avx.Subtract(tmp13, z1);
+
+ // Odd part
+ tmp10 = Avx.Add(tmp4, tmp5);
+ tmp11 = Avx.Add(tmp5, tmp6);
+ tmp12 = Avx.Add(tmp6, tmp7);
+
+ Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
+ Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
+ Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
+ Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
+
+ Vector256 z11 = Avx.Add(tmp7, z3);
+ Vector256 z13 = Avx.Subtract(tmp7, z3);
+
+ block.V5 = Avx.Add(z13, z2);
+ block.V3 = Avx.Subtract(z13, z2);
+ block.V1 = Avx.Add(z11, z4);
+ block.V7 = Avx.Subtract(z11, z4);
+ }
}
///
- /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix.
+ /// Apply floating point IDCT inplace using simd operations.
///
- ///
- /// Requires Avx support.
- ///
- /// Input matrix.
- public static void FDCT8x8_Avx(ref Block8x8F block)
+ /// Transposed input block.
+ private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
{
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
- Vector256 tmp0 = Avx.Add(block.V0, block.V7);
- Vector256 tmp7 = Avx.Subtract(block.V0, block.V7);
- Vector256 tmp1 = Avx.Add(block.V1, block.V6);
- Vector256 tmp6 = Avx.Subtract(block.V1, block.V6);
- Vector256 tmp2 = Avx.Add(block.V2, block.V5);
- Vector256 tmp5 = Avx.Subtract(block.V2, block.V5);
- Vector256 tmp3 = Avx.Add(block.V3, block.V4);
- Vector256 tmp4 = Avx.Subtract(block.V3, block.V4);
-
- // Even part
- Vector256 tmp10 = Avx.Add(tmp0, tmp3);
- Vector256 tmp13 = Avx.Subtract(tmp0, tmp3);
- Vector256 tmp11 = Avx.Add(tmp1, tmp2);
- Vector256 tmp12 = Avx.Subtract(tmp1, tmp2);
-
- block.V0 = Avx.Add(tmp10, tmp11);
- block.V4 = Avx.Subtract(tmp10, tmp11);
-
- Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
- block.V2 = Avx.Add(tmp13, z1);
- block.V6 = Avx.Subtract(tmp13, z1);
-
- // Odd part
- tmp10 = Avx.Add(tmp4, tmp5);
- tmp11 = Avx.Add(tmp5, tmp6);
- tmp12 = Avx.Add(tmp6, tmp7);
-
- Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
- Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
- Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
- Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
-
- Vector256 z11 = Avx.Add(tmp7, z3);
- Vector256 z13 = Avx.Subtract(tmp7, z3);
-
- block.V5 = Avx.Add(z13, z2);
- block.V3 = Avx.Subtract(z13, z2);
- block.V1 = Avx.Add(z11, z4);
- block.V7 = Avx.Subtract(z11, z4);
- }
-
- ///
- /// Combined operation of and
- /// using AVX commands.
- ///
- /// Source
- /// Destination
- public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
- {
- Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
- Vector256 my1 = s.V1;
- Vector256 my7 = s.V7;
- Vector256 mz0 = Avx.Add(my1, my7);
-
- Vector256 my3 = s.V3;
- Vector256 mz2 = Avx.Add(my3, my7);
- Vector256 my5 = s.V5;
- Vector256 mz1 = Avx.Add(my3, my5);
- Vector256 mz3 = Avx.Add(my1, my5);
-
- Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758);
-
- mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615);
- mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901);
- mz0 = Avx.Multiply(mz0, mm256_F_n0_8999);
- mz1 = Avx.Multiply(mz1, mm256_F_n2_5629);
-
- Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2);
- Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3);
- Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2);
- Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3);
-
- Vector256 my2 = s.V2;
- Vector256 my6 = s.V6;
- mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411);
- Vector256 my0 = s.V0;
- Vector256 my4 = s.V4;
- mz0 = Avx.Add(my0, my4);
- mz1 = Avx.Subtract(my0, my4);
- mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477);
- mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653);
-
- my0 = Avx.Add(mz0, mz3);
- my3 = Avx.Subtract(mz0, mz3);
- my1 = Avx.Add(mz1, mz2);
- my2 = Avx.Subtract(mz1, mz2);
-
- d.V0 = Avx.Add(my0, mb0);
- d.V7 = Avx.Subtract(my0, mb0);
- d.V1 = Avx.Add(my1, mb1);
- d.V6 = Avx.Subtract(my1, mb1);
- d.V2 = Avx.Add(my2, mb2);
- d.V5 = Avx.Subtract(my2, mb2);
- d.V3 = Avx.Add(my3, mb3);
- d.V4 = Avx.Subtract(my3, mb3);
+ // First pass - process columns
+ IDCT8x8_1D_Avx(ref transposedBlock);
+
+ // Second pass - process rows
+ transposedBlock.TransposeInplace();
+ IDCT8x8_1D_Avx(ref transposedBlock);
+
+ // Applies 1D floating point IDCT inplace
+ static void IDCT8x8_1D_Avx(ref Block8x8F block)
+ {
+ // Even part
+ Vector256 tmp0 = block.V0;
+ Vector256 tmp1 = block.V2;
+ Vector256 tmp2 = block.V4;
+ Vector256 tmp3 = block.V6;
+
+ Vector256 z5 = tmp0;
+ Vector256 tmp10 = Avx.Add(z5, tmp2);
+ Vector256 tmp11 = Avx.Subtract(z5, tmp2);
+
+ Vector256 tmp13 = Avx.Add(tmp1, tmp3);
+ Vector256 tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
+
+ tmp0 = Avx.Add(tmp10, tmp13);
+ tmp3 = Avx.Subtract(tmp10, tmp13);
+ tmp1 = Avx.Add(tmp11, tmp12);
+ tmp2 = Avx.Subtract(tmp11, tmp12);
+
+ // Odd part
+ Vector256 tmp4 = block.V1;
+ Vector256 tmp5 = block.V3;
+ Vector256 tmp6 = block.V5;
+ Vector256 tmp7 = block.V7;
+
+ Vector256 z13 = Avx.Add(tmp6, tmp5);
+ Vector256 z10 = Avx.Subtract(tmp6, tmp5);
+ Vector256 z11 = Avx.Add(tmp4, tmp7);
+ Vector256 z12 = Avx.Subtract(tmp4, tmp7);
+
+ tmp7 = Avx.Add(z11, z13);
+ tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);
+
+ z5 = Avx.Multiply(Avx.Add(z10, z12), mm256_F_1_8477);
+
+ tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, mm256_F_n1_0823);
+ tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, mm256_F_n2_6131);
+
+ tmp6 = Avx.Subtract(tmp12, tmp7);
+ tmp5 = Avx.Subtract(tmp11, tmp6);
+ tmp4 = Avx.Subtract(tmp10, tmp5);
+
+ block.V0 = Avx.Add(tmp0, tmp7);
+ block.V7 = Avx.Subtract(tmp0, tmp7);
+ block.V1 = Avx.Add(tmp1, tmp6);
+ block.V6 = Avx.Subtract(tmp1, tmp6);
+ block.V2 = Avx.Add(tmp2, tmp5);
+ block.V5 = Avx.Subtract(tmp2, tmp5);
+ block.V3 = Avx.Add(tmp3, tmp4);
+ block.V4 = Avx.Subtract(tmp3, tmp4);
+ }
}
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 6963c36369..81bfe2135d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -3,6 +3,7 @@
using System.Numerics;
using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics.X86;
#endif
@@ -15,102 +16,202 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
///
internal static partial class FastFloatingPointDCT
{
-#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
- private const float C_1_175876 = 1.175875602f;
- private const float C_1_961571 = -1.961570560f;
- private const float C_0_390181 = -0.390180644f;
- private const float C_0_899976 = -0.899976223f;
- private const float C_2_562915 = -2.562915447f;
- private const float C_0_298631 = 0.298631336f;
- private const float C_2_053120 = 2.053119869f;
- private const float C_3_072711 = 3.072711026f;
- private const float C_1_501321 = 1.501321110f;
- private const float C_0_541196 = 0.541196100f;
- private const float C_1_847759 = -1.847759065f;
- private const float C_0_765367 = 0.765366865f;
-
- private const float C_0_125 = 0.1250f;
-
-#pragma warning disable SA1311, IDE1006 // naming rules violation warnings
- private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f);
- private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f);
- private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f);
- private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f);
-#pragma warning restore SA1311, IDE1006
-
-#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+ private static readonly Vector4 mm128_F_0_7071 = new(0.707106781f);
+ private static readonly Vector4 mm128_F_0_3826 = new(0.382683433f);
+ private static readonly Vector4 mm128_F_0_5411 = new(0.541196100f);
+ private static readonly Vector4 mm128_F_1_3065 = new(1.306562965f);
+
+ private static readonly Vector4 mm128_F_1_4142 = new(1.414213562f);
+ private static readonly Vector4 mm128_F_1_8477 = new(1.847759065f);
+ private static readonly Vector4 mm128_F_n1_0823 = new(-1.082392200f);
+ private static readonly Vector4 mm128_F_n2_6131 = new(-2.613125930f);
+#pragma warning restore SA1310, SA1311, IDE1006
///
- /// Gets reciprocal coefficients for jpeg quantization tables calculation.
+ /// Gets adjustment table for quantization tables.
///
///
///
- /// Current FDCT implementation expects its results to be multiplied by
- /// a reciprocal quantization table. To get 8x8 reciprocal block values in this
- /// table must be divided by quantization table values scaled with quality settings.
+ /// Current IDCT and FDCT implementations are based on Arai, Agui,
+ /// and Nakajima's algorithm. Neither DCT method produces
+ /// finished DCT output; the final scaling step is fused into the
+ /// quantization step. Quantization and de-quantization coefficients
+ /// must be multiplied by these values.
///
///
- /// These values were calculates with this formula:
- ///
- /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
- ///
- /// Where:
+ /// Given values were generated by formula:
///
+ /// scalefactor[row] * scalefactor[col], where
/// scalefactor[0] = 1
- ///
- ///
/// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
///
- /// Values are also scaled by 8 so DCT code won't do extra division/multiplication.
///
///
- internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[]
+ private static readonly float[] AdjustmentCoefficients = new float[]
{
- 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
- 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
- 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
- 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
- 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
- 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
- 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
- 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
+ 1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f,
+ 1.3870399f, 1.9238797f, 1.812255f, 1.6309863f, 1.3870399f, 1.0897902f, 0.7506606f, 0.38268346f,
+ 1.306563f, 1.812255f, 1.707107f, 1.5363555f, 1.306563f, 1.02656f, 0.7071068f, 0.36047992f,
+ 1.1758755f, 1.6309863f, 1.5363555f, 1.3826833f, 1.1758755f, 0.9238795f, 0.63637924f, 0.32442334f,
+ 1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f,
+ 0.78569496f, 1.0897902f, 1.02656f, 0.9238795f, 0.78569496f, 0.61731654f, 0.42521507f, 0.21677275f,
+ 0.5411961f, 0.7506606f, 0.7071068f, 0.63637924f, 0.5411961f, 0.42521507f, 0.29289323f, 0.14931567f,
+ 0.27589938f, 0.38268346f, 0.36047992f, 0.32442334f, 0.27589938f, 0.21677275f, 0.14931567f, 0.076120466f,
};
///
- /// Adjusts given quantization table to be complient with FDCT implementation.
+ /// Adjusts given quantization table for usage with <see cref="TransformIDCT"/>.
+ ///
+ /// Quantization table to adjust.
+ public static void AdjustToIDCT(ref Block8x8F quantTable)
+ {
+ ref float tableRef = ref Unsafe.As(ref quantTable);
+ ref float multipliersRef = ref MemoryMarshal.GetReference(AdjustmentCoefficients);
+ for (nint i = 0; i < Block8x8F.Size; i++)
+ {
+ tableRef = 0.125f * tableRef * Unsafe.Add(ref multipliersRef, i);
+ tableRef = ref Unsafe.Add(ref tableRef, 1);
+ }
+
+ // Spectral macroblocks are transposed before quantization
+ // so we must transpose quantization table
+ quantTable.TransposeInplace();
+ }
+
+ /// <summary>
+ /// Adjusts given quantization table for usage with <see cref="TransformFDCT"/>: every entry q is replaced in place with 0.125 / (q * adjustment coefficient).
+ /// </summary>
+ /// <param name="quantTable">Quantization table to adjust.</param>
+ public static void AdjustToFDCT(ref Block8x8F quantTable)
+ {
+ ref float tableRef = ref Unsafe.As(ref quantTable);
+ ref float multipliersRef = ref MemoryMarshal.GetReference(AdjustmentCoefficients);
+ for (nint i = 0; i < Block8x8F.Size; i++)
+ {
+ tableRef = 0.125f / (tableRef * Unsafe.Add(ref multipliersRef, i));
+ tableRef = ref Unsafe.Add(ref tableRef, 1);
+ }
+ }
+
+ ///
+ /// Apply 2D floating point IDCT inplace.
///
///
- /// See docs for explanation.
+ /// Input block must be dequantized before this method with table
+ /// adjusted by <see cref="AdjustToIDCT"/>.
///
- /// Quantization table to adjust.
- public static void AdjustToFDCT(ref Block8x8F quantizationtable)
+ /// Input block.
+ public static void TransformIDCT(ref Block8x8F block)
{
- for (int i = 0; i < Block8x8F.Size; i++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx.IsSupported)
{
- quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i];
+ IDCT8x8_Avx(ref block);
+ }
+ else
+#endif
+ {
+ IDCT_Vector4(ref block);
}
}
///
- /// Apply 2D floating point FDCT inplace.
+ /// Apply 2D floating point FDCT inplace.
///
- /// Input matrix.
+ ///
+ /// Input block must be quantized after this method with table adjusted
+ /// by <see cref="AdjustToFDCT"/>.
+ ///
+ /// Input block.
public static void TransformFDCT(ref Block8x8F block)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
- ForwardTransform_Avx(ref block);
+ FDCT8x8_Avx(ref block);
}
else
#endif
if (Vector.IsHardwareAccelerated)
{
- ForwardTransform_Vector4(ref block);
+ FDCT_Vector4(ref block);
}
else
{
- ForwardTransform_Scalar(ref block);
+ FDCT_Scalar(ref block);
+ }
+ }
+
+ /// <summary>
+ /// Apply floating point IDCT inplace using the <see cref="Vector4"/> API.
+ /// </summary>
+ /// <param name="transposedBlock">Transposed input block.</param>
+ private static void IDCT_Vector4(ref Block8x8F transposedBlock)
+ {
+ DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); // NOTE(review): TransformIDCT falls back to this method whenever AVX is unavailable, without checking Vector.IsHardwareAccelerated - confirm this guard is intended.
+
+ // First pass - process columns (left half, then right half of the block)
+ IDCT8x4_Vector4(ref transposedBlock.V0L);
+ IDCT8x4_Vector4(ref transposedBlock.V0R);
+
+ // Second pass - process rows (after transposing the first pass result in place)
+ transposedBlock.TransposeInplace();
+ IDCT8x4_Vector4(ref transposedBlock.V0L);
+ IDCT8x4_Vector4(ref transposedBlock.V0R);
+
+ // Applies 1D floating point IDCT inplace on 8x4 part of 8x8 block; vecRef is the first Vector4 of that part (V0L or V0R)
+ static void IDCT8x4_Vector4(ref Vector4 vecRef)
+ {
+ // Even part: rows 0, 2, 4, 6 (the `* 2` stride presumably skips the other Vector4 half of each row - confirm against Block8x8F layout)
+ Vector4 tmp0 = Unsafe.Add(ref vecRef, 0 * 2);
+ Vector4 tmp1 = Unsafe.Add(ref vecRef, 2 * 2);
+ Vector4 tmp2 = Unsafe.Add(ref vecRef, 4 * 2);
+ Vector4 tmp3 = Unsafe.Add(ref vecRef, 6 * 2);
+
+ Vector4 z5 = tmp0;
+ Vector4 tmp10 = z5 + tmp2;
+ Vector4 tmp11 = z5 - tmp2;
+
+ Vector4 tmp13 = tmp1 + tmp3;
+ Vector4 tmp12 = ((tmp1 - tmp3) * mm128_F_1_4142) - tmp13;
+
+ tmp0 = tmp10 + tmp13;
+ tmp3 = tmp10 - tmp13;
+ tmp1 = tmp11 + tmp12;
+ tmp2 = tmp11 - tmp12;
+
+ // Odd part: rows 1, 3, 5, 7
+ Vector4 tmp4 = Unsafe.Add(ref vecRef, 1 * 2);
+ Vector4 tmp5 = Unsafe.Add(ref vecRef, 3 * 2);
+ Vector4 tmp6 = Unsafe.Add(ref vecRef, 5 * 2);
+ Vector4 tmp7 = Unsafe.Add(ref vecRef, 7 * 2);
+
+ Vector4 z13 = tmp6 + tmp5;
+ Vector4 z10 = tmp6 - tmp5;
+ Vector4 z11 = tmp4 + tmp7;
+ Vector4 z12 = tmp4 - tmp7;
+
+ tmp7 = z11 + z13;
+ tmp11 = (z11 - z13) * mm128_F_1_4142;
+
+ z5 = (z10 + z12) * mm128_F_1_8477;
+
+ tmp10 = (z12 * mm128_F_n1_0823) + z5;
+ tmp12 = (z10 * mm128_F_n2_6131) + z5;
+
+ tmp6 = tmp12 - tmp7;
+ tmp5 = tmp11 - tmp6;
+ tmp4 = tmp10 - tmp5;
+
+ Unsafe.Add(ref vecRef, 0 * 2) = tmp0 + tmp7;
+ Unsafe.Add(ref vecRef, 7 * 2) = tmp0 - tmp7;
+ Unsafe.Add(ref vecRef, 1 * 2) = tmp1 + tmp6;
+ Unsafe.Add(ref vecRef, 6 * 2) = tmp1 - tmp6;
+ Unsafe.Add(ref vecRef, 2 * 2) = tmp2 + tmp5;
+ Unsafe.Add(ref vecRef, 5 * 2) = tmp2 - tmp5;
+ Unsafe.Add(ref vecRef, 3 * 2) = tmp3 + tmp4;
+ Unsafe.Add(ref vecRef, 4 * 2) = tmp3 - tmp4;
}
}
@@ -120,8 +221,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
///
/// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
///
- /// Input matrix.
- private static void ForwardTransform_Scalar(ref Block8x8F block)
+ /// Input block.
+ private static void FDCT_Scalar(ref Block8x8F block)
{
const int dctSize = 8;
@@ -130,17 +231,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
float z1, z2, z3, z4, z5, z11, z13;
// First pass - process rows
- ref float dataRef = ref Unsafe.As(ref block);
+ ref float blockRef = ref Unsafe.As(ref block);
for (int ctr = 7; ctr >= 0; ctr--)
{
- tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7);
- tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7);
- tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6);
- tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6);
- tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5);
- tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5);
- tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4);
- tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4);
+ tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 7);
+ tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 7);
+ tmp1 = Unsafe.Add(ref blockRef, 1) + Unsafe.Add(ref blockRef, 6);
+ tmp6 = Unsafe.Add(ref blockRef, 1) - Unsafe.Add(ref blockRef, 6);
+ tmp2 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 5);
+ tmp5 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 5);
+ tmp3 = Unsafe.Add(ref blockRef, 3) + Unsafe.Add(ref blockRef, 4);
+ tmp4 = Unsafe.Add(ref blockRef, 3) - Unsafe.Add(ref blockRef, 4);
// Even part
tmp10 = tmp0 + tmp3;
@@ -148,12 +249,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11;
- Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11;
+ Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
+ Unsafe.Add(ref blockRef, 4) = tmp10 - tmp11;
z1 = (tmp12 + tmp13) * 0.707106781f;
- Unsafe.Add(ref dataRef, 2) = tmp13 + z1;
- Unsafe.Add(ref dataRef, 6) = tmp13 - z1;
+ Unsafe.Add(ref blockRef, 2) = tmp13 + z1;
+ Unsafe.Add(ref blockRef, 6) = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5;
@@ -168,26 +269,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
z11 = tmp7 + z3;
z13 = tmp7 - z3;
- Unsafe.Add(ref dataRef, 5) = z13 + z2;
- Unsafe.Add(ref dataRef, 3) = z13 - z2;
- Unsafe.Add(ref dataRef, 1) = z11 + z4;
- Unsafe.Add(ref dataRef, 7) = z11 - z4;
+ Unsafe.Add(ref blockRef, 5) = z13 + z2;
+ Unsafe.Add(ref blockRef, 3) = z13 - z2;
+ Unsafe.Add(ref blockRef, 1) = z11 + z4;
+ Unsafe.Add(ref blockRef, 7) = z11 - z4;
- dataRef = ref Unsafe.Add(ref dataRef, dctSize);
+ blockRef = ref Unsafe.Add(ref blockRef, dctSize);
}
// Second pass - process columns
- dataRef = ref Unsafe.As(ref block);
+ blockRef = ref Unsafe.As(ref block);
for (int ctr = 7; ctr >= 0; ctr--)
{
- tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7);
- tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7);
- tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6);
- tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6);
- tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5);
- tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5);
- tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4);
- tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4);
+ tmp0 = Unsafe.Add(ref blockRef, dctSize * 0) + Unsafe.Add(ref blockRef, dctSize * 7);
+ tmp7 = Unsafe.Add(ref blockRef, dctSize * 0) - Unsafe.Add(ref blockRef, dctSize * 7);
+ tmp1 = Unsafe.Add(ref blockRef, dctSize * 1) + Unsafe.Add(ref blockRef, dctSize * 6);
+ tmp6 = Unsafe.Add(ref blockRef, dctSize * 1) - Unsafe.Add(ref blockRef, dctSize * 6);
+ tmp2 = Unsafe.Add(ref blockRef, dctSize * 2) + Unsafe.Add(ref blockRef, dctSize * 5);
+ tmp5 = Unsafe.Add(ref blockRef, dctSize * 2) - Unsafe.Add(ref blockRef, dctSize * 5);
+ tmp3 = Unsafe.Add(ref blockRef, dctSize * 3) + Unsafe.Add(ref blockRef, dctSize * 4);
+ tmp4 = Unsafe.Add(ref blockRef, dctSize * 3) - Unsafe.Add(ref blockRef, dctSize * 4);
// Even part
tmp10 = tmp0 + tmp3;
@@ -195,12 +296,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11;
- Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11;
+ Unsafe.Add(ref blockRef, dctSize * 0) = tmp10 + tmp11;
+ Unsafe.Add(ref blockRef, dctSize * 4) = tmp10 - tmp11;
z1 = (tmp12 + tmp13) * 0.707106781f;
- Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1;
- Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1;
+ Unsafe.Add(ref blockRef, dctSize * 2) = tmp13 + z1;
+ Unsafe.Add(ref blockRef, dctSize * 6) = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5;
@@ -215,12 +316,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
z11 = tmp7 + z3;
z13 = tmp7 - z3;
- Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2;
- Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2;
- Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4;
- Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4;
+ Unsafe.Add(ref blockRef, dctSize * 5) = z13 + z2;
+ Unsafe.Add(ref blockRef, dctSize * 3) = z13 - z2;
+ Unsafe.Add(ref blockRef, dctSize * 1) = z11 + z4;
+ Unsafe.Add(ref blockRef, dctSize * 7) = z11 - z4;
- dataRef = ref Unsafe.Add(ref dataRef, 1);
+ blockRef = ref Unsafe.Add(ref blockRef, 1);
}
}
@@ -230,11 +331,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
///
/// This implementation must be called only if hardware supports 4
/// floating point numbers vector. Otherwise explicit scalar
- /// implementation is faster
- /// because it does not rely on matrix transposition.
+ /// implementation is faster
+ /// because it does not rely on block transposition.
///
- /// Input matrix.
- private static void ForwardTransform_Vector4(ref Block8x8F block)
+ /// Input block.
+ public static void FDCT_Vector4(ref Block8x8F block)
{
DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
@@ -247,209 +348,50 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
block.TransposeInplace();
FDCT8x4_Vector4(ref block.V0L);
FDCT8x4_Vector4(ref block.V0R);
- }
- ///
- /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix.
- ///
- ///
- /// Implemented using Vector4 API operations for either scalar or sse hardware implementation.
- /// Must be called on both 8x4 matrix parts for the full FDCT transform.
- ///
- /// Input reference to the first
- private static void FDCT8x4_Vector4(ref Vector4 blockRef)
- {
- Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14);
- Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14);
- Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12);
- Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12);
- Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10);
- Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10);
- Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8);
- Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8);
-
- // Even part
- Vector4 tmp10 = tmp0 + tmp3;
- Vector4 tmp13 = tmp0 - tmp3;
- Vector4 tmp11 = tmp1 + tmp2;
- Vector4 tmp12 = tmp1 - tmp2;
-
- Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
- Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11;
-
- Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
- Unsafe.Add(ref blockRef, 4) = tmp13 + z1;
- Unsafe.Add(ref blockRef, 12) = tmp13 - z1;
-
- // Odd part
- tmp10 = tmp4 + tmp5;
- tmp11 = tmp5 + tmp6;
- tmp12 = tmp6 + tmp7;
-
- Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
- Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
- Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
- Vector4 z3 = tmp11 * mm128_F_0_7071;
-
- Vector4 z11 = tmp7 + z3;
- Vector4 z13 = tmp7 - z3;
-
- Unsafe.Add(ref blockRef, 10) = z13 + z2;
- Unsafe.Add(ref blockRef, 6) = z13 - z2;
- Unsafe.Add(ref blockRef, 2) = z11 + z4;
- Unsafe.Add(ref blockRef, 14) = z11 - z4;
- }
+ // Applies a 1D floating point FDCT in place to the 8x4 part of the 8x8 block starting at vecRef (invoked above once with V0L and once with V0R)
+ static void FDCT8x4_Vector4(ref Vector4 vecRef)
+ {
+ Vector4 tmp0 = Unsafe.Add(ref vecRef, 0) + Unsafe.Add(ref vecRef, 14);
+ Vector4 tmp7 = Unsafe.Add(ref vecRef, 0) - Unsafe.Add(ref vecRef, 14);
+ Vector4 tmp1 = Unsafe.Add(ref vecRef, 2) + Unsafe.Add(ref vecRef, 12);
+ Vector4 tmp6 = Unsafe.Add(ref vecRef, 2) - Unsafe.Add(ref vecRef, 12);
+ Vector4 tmp2 = Unsafe.Add(ref vecRef, 4) + Unsafe.Add(ref vecRef, 10);
+ Vector4 tmp5 = Unsafe.Add(ref vecRef, 4) - Unsafe.Add(ref vecRef, 10);
+ Vector4 tmp3 = Unsafe.Add(ref vecRef, 6) + Unsafe.Add(ref vecRef, 8);
+ Vector4 tmp4 = Unsafe.Add(ref vecRef, 6) - Unsafe.Add(ref vecRef, 8);
- ///
- /// Apply floating point IDCT inplace.
- /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
- ///
- /// Input matrix.
- /// Matrix to store temporal results.
- public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
- {
- block.TransposeInplace();
- IDCT8x8(ref block, ref temp);
- temp.TransposeInplace();
- IDCT8x8(ref temp, ref block);
+ // Even part
+ Vector4 tmp10 = tmp0 + tmp3;
+ Vector4 tmp13 = tmp0 - tmp3;
+ Vector4 tmp11 = tmp1 + tmp2;
+ Vector4 tmp12 = tmp1 - tmp2;
- // TODO: This can be fused into quantization table step
- block.MultiplyInPlace(C_0_125);
- }
+ Unsafe.Add(ref vecRef, 0) = tmp10 + tmp11;
+ Unsafe.Add(ref vecRef, 8) = tmp10 - tmp11;
- ///
- /// Performs 8x8 matrix Inverse Discrete Cosine Transform
- ///
- /// Source
- /// Destination
- private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (Avx.IsSupported)
- {
- IDCT8x8_Avx(ref s, ref d);
- }
- else
-#endif
- {
- IDCT8x4_LeftPart(ref s, ref d);
- IDCT8x4_RightPart(ref s, ref d);
- }
- }
+ Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
+ Unsafe.Add(ref vecRef, 4) = tmp13 + z1;
+ Unsafe.Add(ref vecRef, 12) = tmp13 - z1;
- ///
- /// Do IDCT internal operations on the left part of the block. Original src:
- /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
- ///
- /// The source block
- /// Destination block
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
- {
- Vector4 my1 = s.V1L;
- Vector4 my7 = s.V7L;
- Vector4 mz0 = my1 + my7;
-
- Vector4 my3 = s.V3L;
- Vector4 mz2 = my3 + my7;
- Vector4 my5 = s.V5L;
- Vector4 mz1 = my3 + my5;
- Vector4 mz3 = my1 + my5;
-
- Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
- mz2 = (mz2 * C_1_961571) + mz4;
- mz3 = (mz3 * C_0_390181) + mz4;
- mz0 = mz0 * C_0_899976;
- mz1 = mz1 * C_2_562915;
-
- Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
- Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
- Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
- Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
- Vector4 my2 = s.V2L;
- Vector4 my6 = s.V6L;
- mz4 = (my2 + my6) * C_0_541196;
- Vector4 my0 = s.V0L;
- Vector4 my4 = s.V4L;
- mz0 = my0 + my4;
- mz1 = my0 - my4;
-
- mz2 = mz4 + (my6 * C_1_847759);
- mz3 = mz4 + (my2 * C_0_765367);
-
- my0 = mz0 + mz3;
- my3 = mz0 - mz3;
- my1 = mz1 + mz2;
- my2 = mz1 - mz2;
-
- d.V0L = my0 + mb0;
- d.V7L = my0 - mb0;
- d.V1L = my1 + mb1;
- d.V6L = my1 - mb1;
- d.V2L = my2 + mb2;
- d.V5L = my2 - mb2;
- d.V3L = my3 + mb3;
- d.V4L = my3 - mb3;
- }
+ // Odd part
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
- ///
- /// Do IDCT internal operations on the right part of the block.
- /// Original src:
- /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
- ///
- /// The source block
- /// The destination block
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
- {
- Vector4 my1 = s.V1R;
- Vector4 my7 = s.V7R;
- Vector4 mz0 = my1 + my7;
-
- Vector4 my3 = s.V3R;
- Vector4 mz2 = my3 + my7;
- Vector4 my5 = s.V5R;
- Vector4 mz1 = my3 + my5;
- Vector4 mz3 = my1 + my5;
-
- Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
- mz2 = (mz2 * C_1_961571) + mz4;
- mz3 = (mz3 * C_0_390181) + mz4;
- mz0 = mz0 * C_0_899976;
- mz1 = mz1 * C_2_562915;
-
- Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
- Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
- Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
- Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
- Vector4 my2 = s.V2R;
- Vector4 my6 = s.V6R;
- mz4 = (my2 + my6) * C_0_541196;
- Vector4 my0 = s.V0R;
- Vector4 my4 = s.V4R;
- mz0 = my0 + my4;
- mz1 = my0 - my4;
-
- mz2 = mz4 + (my6 * C_1_847759);
- mz3 = mz4 + (my2 * C_0_765367);
-
- my0 = mz0 + mz3;
- my3 = mz0 - mz3;
- my1 = mz1 + mz2;
- my2 = mz1 - mz2;
-
- d.V0R = my0 + mb0;
- d.V7R = my0 - mb0;
- d.V1R = my1 + mb1;
- d.V6R = my1 - mb1;
- d.V2R = my2 + mb2;
- d.V5R = my2 - mb2;
- d.V3R = my3 + mb3;
- d.V4R = my3 - mb3;
+ Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
+ Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
+ Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
+ Vector4 z3 = tmp11 * mm128_F_0_7071;
+
+ Vector4 z11 = tmp7 + z3;
+ Vector4 z13 = tmp7 - z3;
+
+ Unsafe.Add(ref vecRef, 10) = z13 + z2;
+ Unsafe.Add(ref vecRef, 6) = z13 - z2;
+ Unsafe.Add(ref vecRef, 2) = z11 + z4;
+ Unsafe.Add(ref vecRef, 14) = z11 - z4;
+ }
}
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
index e519a8a1dc..ab80b3ae67 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
@@ -35,5 +35,34 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
63, 63, 63, 63, 63, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63
};
+
+ ///
+ /// Gets the span of zig-zag ordering indices with a transpose step fused in (the table starts 0, 8, 1 rather than the usual 0, 1, 8).
+ ///
+ ///
+ /// When reading corrupted data, the Huffman decoders could attempt
+ /// to reference an entry beyond the end of this array (if the decoded
+ /// zero run length reaches past the end of the block). To prevent
+ /// wild stores without adding an inner-loop test, we put some extra
+ /// "63"s after the real entries. This will cause the extra coefficient
+ /// to be stored in location 63 of the block, not somewhere random.
+ /// The worst case would be a run-length of 15, which means we need 16
+ /// fake entries.
+ ///
+ public static ReadOnlySpan TransposingOrder => new byte[]
+ {
+ 0, 8, 1, 2, 9, 16, 24, 17,
+ 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20,
+ 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36,
+ 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53,
+ 46, 39, 47, 54, 61, 62, 55, 63,
+
+ // 16 padding entries, all pointing at index 63, so an over-long run length cannot index past the table (see remarks)
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63
+ };
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
index 9a9e5eb799..73763f4ab8 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
@@ -942,6 +942,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
break;
}
}
+
+ // Adjusting table for IDCT step during decompression
+ FastFloatingPointDCT.AdjustToIDCT(ref table);
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index d9d42e0614..abe59516fa 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -288,8 +288,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg
/// The number of components to write.
private void WriteDefineHuffmanTables(int componentCount)
{
+ // This relies on a C# compiler optimization: the span refers directly to the static data segment of the assembly,
+ // so it doesn't incur any allocation at all.
// Table identifiers.
- ReadOnlySpan headers = stackalloc byte[]
+ ReadOnlySpan headers = new byte[]
{
0x00,
0x10,
diff --git a/src/ImageSharp/Formats/Tiff/Compression/Compressors/TiffLzwEncoder.cs b/src/ImageSharp/Formats/Tiff/Compression/Compressors/TiffLzwEncoder.cs
index baeabdbb20..d4d1d1cb65 100644
--- a/src/ImageSharp/Formats/Tiff/Compression/Compressors/TiffLzwEncoder.cs
+++ b/src/ImageSharp/Formats/Tiff/Compression/Compressors/TiffLzwEncoder.cs
@@ -256,8 +256,8 @@ namespace SixLabors.ImageSharp.Formats.Tiff.Compression.Compressors
private void ResetTables()
{
- this.children.GetSpan().Fill(0);
- this.siblings.GetSpan().Fill(0);
+ this.children.GetSpan().Clear();
+ this.siblings.GetSpan().Clear();
this.bitsPerCode = MinBits;
this.maxCode = MaxValue(this.bitsPerCode);
this.nextValidCode = EoiCode + 1;
diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/T6TiffCompression.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/T6TiffCompression.cs
index e86418741d..972f4d8ff1 100644
--- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/T6TiffCompression.cs
+++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/T6TiffCompression.cs
@@ -64,7 +64,7 @@ namespace SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors
uint bitsWritten = 0;
for (int y = 0; y < height; y++)
{
- scanLine.Fill(0);
+ scanLine.Clear();
Decode2DScanline(bitReader, this.isWhiteZero, referenceScanLine, scanLine);
bitsWritten = this.WriteScanLine(buffer, scanLine, bitsWritten);
@@ -116,7 +116,15 @@ namespace SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors
{
// If a TIFF reader encounters EOFB before the expected number of lines has been extracted,
// it is appropriate to assume that the missing rows consist entirely of white pixels.
- scanline.Fill(whiteIsZero ? (byte)0 : (byte)255);
+ if (whiteIsZero)
+ {
+ scanline.Clear();
+ }
+ else
+ {
+ scanline.Fill((byte)255);
+ }
+
break;
}
diff --git a/src/ImageSharp/Formats/Webp/EntropyIx.cs b/src/ImageSharp/Formats/Webp/EntropyIx.cs
index c72ddeb42d..98e8b7e164 100644
--- a/src/ImageSharp/Formats/Webp/EntropyIx.cs
+++ b/src/ImageSharp/Formats/Webp/EntropyIx.cs
@@ -6,7 +6,7 @@ namespace SixLabors.ImageSharp.Formats.Webp
///
/// These five modes are evaluated and their respective entropy is computed.
///
- internal enum EntropyIx
+ internal enum EntropyIx : byte
{
Direct = 0,
diff --git a/src/ImageSharp/Formats/Webp/HistoIx.cs b/src/ImageSharp/Formats/Webp/HistoIx.cs
index 68b00394b0..83522f9da8 100644
--- a/src/ImageSharp/Formats/Webp/HistoIx.cs
+++ b/src/ImageSharp/Formats/Webp/HistoIx.cs
@@ -3,7 +3,7 @@
namespace SixLabors.ImageSharp.Formats.Webp
{
- internal enum HistoIx
+ internal enum HistoIx : byte
{
HistoAlpha = 0,
diff --git a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs
index dc546f8ac2..c394a8caa8 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/BackwardReferenceEncoder.cs
@@ -2,11 +2,13 @@
// Licensed under the Apache License, Version 2.0.
using System;
+using System.Buffers;
using System.Collections.Generic;
+using SixLabors.ImageSharp.Memory;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
- internal class BackwardReferenceEncoder
+ internal static class BackwardReferenceEncoder
{
///
/// Maximum bit length.
@@ -41,6 +43,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
int quality,
int lz77TypesToTry,
ref int cacheBits,
+ MemoryAllocator memoryAllocator,
Vp8LHashChain hashChain,
Vp8LBackwardRefs best,
Vp8LBackwardRefs worst)
@@ -69,7 +72,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
BackwardReferencesLz77(width, height, bgra, 0, hashChain, worst);
break;
case Vp8LLz77Type.Lz77Box:
- hashChainBox = new Vp8LHashChain(width * height);
+ hashChainBox = new Vp8LHashChain(memoryAllocator, width * height);
BackwardReferencesLz77Box(width, height, bgra, 0, hashChain, hashChainBox, worst);
break;
}
@@ -100,7 +103,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
if ((lz77TypeBest == (int)Vp8LLz77Type.Lz77Standard || lz77TypeBest == (int)Vp8LLz77Type.Lz77Box) && quality >= 25)
{
Vp8LHashChain hashChainTmp = lz77TypeBest == (int)Vp8LLz77Type.Lz77Standard ? hashChain : hashChainBox;
- BackwardReferencesTraceBackwards(width, height, bgra, cacheBits, hashChainTmp, best, worst);
+ BackwardReferencesTraceBackwards(width, height, memoryAllocator, bgra, cacheBits, hashChainTmp, best, worst);
var histo = new Vp8LHistogram(worst, cacheBits);
double bitCostTrace = histo.EstimateBits(stats, bitsEntropy);
if (bitCostTrace < bitCostBest)
@@ -111,6 +114,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
BackwardReferences2DLocality(width, best);
+ hashChainBox?.Dispose();
+
return best;
}
@@ -234,6 +239,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
private static void BackwardReferencesTraceBackwards(
int xSize,
int ySize,
+ MemoryAllocator memoryAllocator,
ReadOnlySpan bgra,
int cacheBits,
Vp8LHashChain hashChain,
@@ -241,22 +247,24 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
Vp8LBackwardRefs refsDst)
{
int distArraySize = xSize * ySize;
- ushort[] distArray = new ushort[distArraySize];
+ using IMemoryOwner distArrayBuffer = memoryAllocator.Allocate(distArraySize);
+ Span distArray = distArrayBuffer.GetSpan();
- BackwardReferencesHashChainDistanceOnly(xSize, ySize, bgra, cacheBits, hashChain, refsSrc, distArray);
+ BackwardReferencesHashChainDistanceOnly(xSize, ySize, memoryAllocator, bgra, cacheBits, hashChain, refsSrc, distArrayBuffer);
int chosenPathSize = TraceBackwards(distArray, distArraySize);
- Span chosenPath = distArray.AsSpan(distArraySize - chosenPathSize);
+ Span chosenPath = distArray.Slice(distArraySize - chosenPathSize);
BackwardReferencesHashChainFollowChosenPath(bgra, cacheBits, chosenPath, chosenPathSize, hashChain, refsDst);
}
private static void BackwardReferencesHashChainDistanceOnly(
int xSize,
int ySize,
+ MemoryAllocator memoryAllocator,
ReadOnlySpan bgra,
int cacheBits,
Vp8LHashChain hashChain,
Vp8LBackwardRefs refs,
- ushort[] distArray)
+ IMemoryOwner distArrayBuffer)
{
int pixCount = xSize * ySize;
bool useColorCache = cacheBits > 0;
@@ -275,22 +283,24 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
costModel.Build(xSize, cacheBits, refs);
- var costManager = new CostManager(distArray, pixCount, costModel);
+ using var costManager = new CostManager(memoryAllocator, distArrayBuffer, pixCount, costModel);
+ Span costManagerCosts = costManager.Costs.GetSpan();
+ Span distArray = distArrayBuffer.GetSpan();
// We loop one pixel at a time, but store all currently best points to non-processed locations from this point.
distArray[0] = 0;
// Add first pixel as literal.
- AddSingleLiteralWithCostModel(bgra, colorCache, costModel, 0, useColorCache, 0.0f, costManager.Costs, distArray);
+ AddSingleLiteralWithCostModel(bgra, colorCache, costModel, 0, useColorCache, 0.0f, costManagerCosts, distArray);
for (int i = 1; i < pixCount; i++)
{
- float prevCost = costManager.Costs[i - 1];
+ float prevCost = costManagerCosts[i - 1];
int offset = hashChain.FindOffset(i);
int len = hashChain.FindLength(i);
// Try adding the pixel as a literal.
- AddSingleLiteralWithCostModel(bgra, colorCache, costModel, i, useColorCache, prevCost, costManager.Costs, distArray);
+ AddSingleLiteralWithCostModel(bgra, colorCache, costModel, i, useColorCache, prevCost, costManagerCosts, distArray);
// If we are dealing with a non-literal.
if (len >= 2)
@@ -334,7 +344,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
costManager.UpdateCostAtIndex(j - 1, false);
costManager.UpdateCostAtIndex(j, false);
- costManager.PushInterval(costManager.Costs[j - 1] + offsetCost, j, lenJ);
+ costManager.PushInterval(costManagerCosts[j - 1] + offsetCost, j, lenJ);
reach = j + lenJ - 1;
}
}
@@ -346,7 +356,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
}
- private static int TraceBackwards(ushort[] distArray, int distArraySize)
+ private static int TraceBackwards(Span distArray, int distArraySize)
{
int chosenPathSize = 0;
int pathPos = distArraySize;
@@ -426,8 +436,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
int idx,
bool useColorCache,
float prevCost,
- float[] cost,
- ushort[] distArray)
+ Span cost,
+ Span distArray)
{
double costVal = prevCost;
uint color = bgra[idx];
@@ -617,7 +627,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
}
- hashChain.OffsetLength[0] = 0;
+ Span hashChainOffsetLength = hashChain.OffsetLength.GetSpan();
+ hashChainOffsetLength[0] = 0;
for (i = 1; i < pixelCount; i++)
{
int ind;
@@ -695,19 +706,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
if (bestLength <= MinLength)
{
- hashChain.OffsetLength[i] = 0;
+ hashChainOffsetLength[i] = 0;
bestOffsetPrev = 0;
bestLengthPrev = 0;
}
else
{
- hashChain.OffsetLength[i] = (uint)((bestOffset << MaxLengthBits) | bestLength);
+ hashChainOffsetLength[i] = (uint)((bestOffset << MaxLengthBits) | bestLength);
bestOffsetPrev = bestOffset;
bestLengthPrev = bestLength;
}
}
- hashChain.OffsetLength[0] = 0;
+ hashChainOffsetLength[0] = 0;
BackwardReferencesLz77(xSize, ySize, bgra, cacheBits, hashChain, refs);
}
diff --git a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
new file mode 100644
index 0000000000..71f3c5ca9e
--- /dev/null
+++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
@@ -0,0 +1,268 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
+
+namespace SixLabors.ImageSharp.Formats.Webp.Lossless
+{
+ internal static class ColorSpaceTransformUtils
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+ private static readonly Vector128 CollectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
+
+ private static readonly Vector128 CollectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
+
+ private static readonly Vector256 CollectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte();
+
+ private static readonly Vector256 CollectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte();
+
+ private static readonly Vector128 CollectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+ private static readonly Vector128 CollectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+
+ private static readonly Vector128 CollectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+
+ private static readonly Vector128 CollectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
+
+ private static readonly Vector128 CollectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
+
+ private static readonly Vector256 CollectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
+
+ private static readonly Vector256 CollectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
+
+ private static readonly Vector256 CollectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+
+ private static readonly Vector256 CollectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+
+ private static readonly Vector256 CollectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+#endif
+
+ public static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) // Counts, in histo, the transformed blue value of every pixel in a tileWidth x tileHeight tile of bgra; consecutive rows are stride pixels apart.
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx2.IsSupported && tileWidth >= 16) // AVX2 path: 16 pixels per inner iteration
+ {
+ const int span = 16;
+ Span values = stackalloc ushort[span]; // scratch buffer that receives one vector of results
+ var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
+ var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
+ for (int y = 0; y < tileHeight; y++)
+ {
+ Span srcSpan = bgra.Slice(y * stride); // row y of the tile
+ ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan);
+ for (nint x = 0; x <= tileWidth - span; x += span)
+ {
+ nint input0Idx = x;
+ nint input1Idx = x + (span / 2); // second half of the current group of pixels
+ Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
+ Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
+ Vector256 r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); // byte 2 of each input0 pixel moved to the high byte of a 16-bit lane; unselected lanes are zero
+ Vector256 r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256); // same for input1, filling the lanes r0 left empty
+ Vector256 r = Avx2.Or(r0, r1); // r 0
+ Vector256 gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256); // 0 0 | g b
+ Vector256 gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256);
+ Vector256 gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); // g b
+ Vector256 g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256); // g 0
+ Vector256 a = Avx2.MultiplyHigh(r.AsInt16(), multsr); // high 16 bits of (r 0) * multsr
+ Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // high 16 bits of (g 0) * multsg
+ Vector256 c = Avx2.Subtract(gb.AsByte(), b.AsByte()); // byte-wise gb - b
+ Vector256 d = Avx2.Subtract(c, a.AsByte()); // byte-wise c - a
+ Vector256 e = Avx2.And(d, CollectColorBlueTransformsBlueMask256); // 0 b' (only the low byte of each lane is kept)
+
+ ref ushort outputRef = ref MemoryMarshal.GetReference(values);
+ Unsafe.As>(ref outputRef) = e.AsUInt16(); // spill the vector into the scratch buffer
+
+ for (int i = 0; i < span; i++)
+ {
+ ++histo[values[i]]; // one histogram hit per pixel in the group
+ }
+ }
+ }
+
+ int leftOver = tileWidth & (span - 1); // columns the vector loop did not reach (span is a power of two)
+ if (leftOver > 0)
+ {
+ CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); // scalar pass over the right-hand leftOver columns of every row
+ }
+ }
+ else if (Sse41.IsSupported) // SSE4.1 path: 8 pixels per inner iteration; also taken when the AVX2 condition above is false
+ {
+ const int span = 8;
+ Span values = stackalloc ushort[span]; // scratch buffer that receives one vector of results
+ var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
+ var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
+ for (int y = 0; y < tileHeight; y++)
+ {
+ Span srcSpan = bgra.Slice(y * stride); // row y of the tile
+ ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan);
+ for (nint x = 0; x <= tileWidth - span; x += span)
+ {
+ nint input0Idx = x;
+ nint input1Idx = x + (span / 2); // second half of the current group of pixels
+ Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
+ Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
+ Vector128 r0 = Ssse3.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask); // byte 2 of each input0 pixel moved to the high byte of the low four 16-bit lanes
+ Vector128 r1 = Ssse3.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask); // same for input1, in the high four 16-bit lanes
+ Vector128 r = Sse2.Or(r0, r1); // r 0
+ Vector128 gb0 = Sse2.And(input0, CollectColorBlueTransformsGreenBlueMask); // 0 0 | g b
+ Vector128 gb1 = Sse2.And(input1, CollectColorBlueTransformsGreenBlueMask);
+ Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); // g b
+ Vector128 g = Sse2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask); // g 0
+ Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr); // high 16 bits of (r 0) * multsr
+ Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // high 16 bits of (g 0) * multsg
+ Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte()); // byte-wise gb - b
+ Vector128 d = Sse2.Subtract(c, a.AsByte()); // byte-wise c - a
+ Vector128 e = Sse2.And(d, CollectColorBlueTransformsBlueMask); // 0 b' (only the low byte of each lane is kept)
+
+ ref ushort outputRef = ref MemoryMarshal.GetReference(values);
+ Unsafe.As>(ref outputRef) = e.AsUInt16(); // spill the vector into the scratch buffer
+
+ for (int i = 0; i < span; i++)
+ {
+ ++histo[values[i]]; // one histogram hit per pixel in the group
+ }
+ }
+ }
+
+ int leftOver = tileWidth & (span - 1); // columns the vector loop did not reach (span is a power of two)
+ if (leftOver > 0)
+ {
+ CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo); // scalar pass over the right-hand leftOver columns of every row
+ }
+ }
+ else
+#endif
+ {
+ CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo); // no usable SIMD support: process the whole tile with the scalar path
+ }
+ }
+
+ private static void CollectColorBlueTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo) // Scalar path: for each pixel of the tile, increments the histo bucket selected by LosslessUtils.TransformColorBlue.
+ {
+ int pos = 0; // offset of the current row's first pixel within bgra
+ while (tileHeight-- > 0) // one iteration per tile row
+ {
+ for (int x = 0; x < tileWidth; x++)
+ {
+ int idx = LosslessUtils.TransformColorBlue((sbyte)greenToBlue, (sbyte)redToBlue, bgra[pos + x]); // multipliers are narrowed to sbyte before the call
+ ++histo[idx];
+ }
+
+ pos += stride; // advance to the next row
+ }
+ }
+
+ public static void CollectColorRedTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) // Counts, in histo, the transformed red value of every pixel in a tileWidth x tileHeight tile of bgra; consecutive rows are stride pixels apart.
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx2.IsSupported && tileWidth >= 16) // AVX2 path: 16 pixels per inner iteration
+ {
+ var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
+ const int span = 16;
+ Span values = stackalloc ushort[span]; // scratch buffer that receives one vector of results
+ for (int y = 0; y < tileHeight; y++)
+ {
+ Span srcSpan = bgra.Slice(y * stride); // row y of the tile
+ ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan);
+ for (nint x = 0; x <= tileWidth - span; x += span)
+ {
+ nint input0Idx = x;
+ nint input1Idx = x + (span / 2); // second half of the current group of pixels
+ Vector256 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
+ Vector256 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
+ Vector256 g0 = Avx2.And(input0, CollectColorRedTransformsGreenMask256); // 0 0 | g 0
+ Vector256 g1 = Avx2.And(input1, CollectColorRedTransformsGreenMask256);
+ Vector256 g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
+ Vector256 a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
+ Vector256 a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16);
+ Vector256 a = Avx2.PackUnsignedSaturate(a0, a1); // x r
+ Vector256 b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr
+ Vector256 c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r'
+ Vector256 d = Avx2.And(c, CollectColorRedTransformsAndMask256); // 0 r'
+
+ ref ushort outputRef = ref MemoryMarshal.GetReference(values);
+ Unsafe.As>(ref outputRef) = d.AsUInt16(); // spill the vector into the scratch buffer
+
+ for (int i = 0; i < span; i++)
+ {
+ ++histo[values[i]]; // one histogram hit per pixel in the group
+ }
+ }
+ }
+
+ int leftOver = tileWidth & (span - 1); // columns the vector loop did not reach (span is a power of two)
+ if (leftOver > 0)
+ {
+ CollectColorRedTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToRed, histo); // scalar pass over the right-hand leftOver columns of every row
+ }
+ }
+ else if (Sse41.IsSupported) // SSE4.1 path: 8 pixels per inner iteration; also taken when the AVX2 condition above is false
+ {
+ var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
+ const int span = 8;
+ Span values = stackalloc ushort[span]; // scratch buffer that receives one vector of results
+ for (int y = 0; y < tileHeight; y++)
+ {
+ Span srcSpan = bgra.Slice(y * stride); // row y of the tile
+ ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan);
+ for (nint x = 0; x <= tileWidth - span; x += span)
+ {
+ nint input0Idx = x;
+ nint input1Idx = x + (span / 2); // second half of the current group of pixels
+ Vector128 input0 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
+ Vector128 input1 = Unsafe.As>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
+ Vector128 g0 = Sse2.And(input0, CollectColorRedTransformsGreenMask); // 0 0 | g 0
+ Vector128 g1 = Sse2.And(input1, CollectColorRedTransformsGreenMask);
+ Vector128 g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
+ Vector128 a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
+ Vector128 a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16);
+ Vector128 a = Sse41.PackUnsignedSaturate(a0, a1); // x r
+ Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr
+ Vector128 c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r'
+ Vector128 d = Sse2.And(c, CollectColorRedTransformsAndMask); // 0 r'
+
+ ref ushort outputRef = ref MemoryMarshal.GetReference(values);
+ Unsafe.As>(ref outputRef) = d.AsUInt16(); // spill the vector into the scratch buffer
+
+ for (int i = 0; i < span; i++)
+ {
+ ++histo[values[i]]; // one histogram hit per pixel in the group
+ }
+ }
+ }
+
+ int leftOver = tileWidth & (span - 1); // columns the vector loop did not reach (span is a power of two)
+ if (leftOver > 0)
+ {
+ CollectColorRedTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToRed, histo); // scalar pass over the right-hand leftOver columns of every row
+ }
+ }
+ else
+#endif
+ {
+ CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo); // no usable SIMD support: process the whole tile with the scalar path
+ }
+ }
+
+ private static void CollectColorRedTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo) // Scalar path: for each pixel of the tile, increments the histo bucket selected by LosslessUtils.TransformColorRed.
+ {
+ int pos = 0; // offset of the current row's first pixel within bgra
+ while (tileHeight-- > 0) // one iteration per tile row
+ {
+ for (int x = 0; x < tileWidth; x++)
+ {
+ int idx = LosslessUtils.TransformColorRed((sbyte)greenToRed, bgra[pos + x]); // multiplier is narrowed to sbyte before the call
+ ++histo[idx];
+ }
+
+ pos += stride; // advance to the next row
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs b/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs
index 94c7bd8470..c121a41a1a 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/CostManager.cs
@@ -1,7 +1,10 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
+using System;
+using System.Buffers;
using System.Collections.Generic;
+using SixLabors.ImageSharp.Memory;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
@@ -10,20 +13,29 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
/// It caches the different CostCacheInterval, caches the different
/// GetLengthCost(costModel, k) in costCache and the CostInterval's.
///
- internal class CostManager
+ internal sealed class CostManager : IDisposable
{
private CostInterval head;
- public CostManager(ushort[] distArray, int pixCount, CostModel costModel)
+ private const int FreeIntervalsStartCount = 25;
+
+ private readonly Stack freeIntervals = new(FreeIntervalsStartCount);
+
+ public CostManager(MemoryAllocator memoryAllocator, IMemoryOwner distArray, int pixCount, CostModel costModel)
{
int costCacheSize = pixCount > BackwardReferenceEncoder.MaxLength ? BackwardReferenceEncoder.MaxLength : pixCount;
this.CacheIntervals = new List();
this.CostCache = new List();
- this.Costs = new float[pixCount];
+ this.Costs = memoryAllocator.Allocate(pixCount);
this.DistArray = distArray;
this.Count = 0;
+ for (int i = 0; i < FreeIntervalsStartCount; i++)
+ {
+ this.freeIntervals.Push(new CostInterval());
+ }
+
// Fill in the cost cache.
this.CacheIntervalsSize++;
this.CostCache.Add(costModel.GetLengthCost(0));
@@ -64,10 +76,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
// Set the initial costs high for every pixel as we will keep the minimum.
- for (int i = 0; i < pixCount; i++)
- {
- this.Costs[i] = 1e38f;
- }
+ this.Costs.GetSpan().Fill(1e38f);
}
///
@@ -82,9 +91,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
public int CacheIntervalsSize { get; }
- public float[] Costs { get; }
+ public IMemoryOwner Costs { get; }
- public ushort[] DistArray { get; }
+ public IMemoryOwner DistArray { get; }
public List CacheIntervals { get; }
@@ -128,6 +137,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
// interval logic, just serialize it right away. This constant is empirical.
int skipDistance = 10;
+ Span costs = this.Costs.GetSpan();
+ Span distArray = this.DistArray.GetSpan();
if (len < skipDistance)
{
for (int j = position; j < position + len; j++)
@@ -135,10 +146,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
int k = j - position;
float costTmp = (float)(distanceCost + this.CostCache[k]);
- if (this.Costs[j] > costTmp)
+ if (costs[j] > costTmp)
{
- this.Costs[j] = costTmp;
- this.DistArray[j] = (ushort)(k + 1);
+ costs[j] = costTmp;
+ distArray[j] = (ushort)(k + 1);
}
}
@@ -201,10 +212,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
this.InsertInterval(interval, interval.Cost, interval.Index, end, endOriginal);
break;
}
- else
- {
- interval.End = start;
- }
+
+ interval.End = start;
}
}
@@ -226,6 +235,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
this.ConnectIntervals(interval.Previous, interval.Next);
this.Count--;
+
+ interval.Next = null;
+ interval.Previous = null;
+ this.freeIntervals.Push(interval);
}
private void InsertInterval(CostInterval intervalIn, float cost, int position, int start, int end)
@@ -236,13 +249,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
// TODO: should we use COST_CACHE_INTERVAL_SIZE_MAX?
- var intervalNew = new CostInterval()
+ CostInterval intervalNew;
+ if (this.freeIntervals.Count > 0)
{
- Cost = cost,
- Start = start,
- End = end,
- Index = position
- };
+ intervalNew = this.freeIntervals.Pop();
+ intervalNew.Cost = cost;
+ intervalNew.Start = start;
+ intervalNew.End = end;
+ intervalNew.Index = position;
+ }
+ else
+ {
+ intervalNew = new CostInterval() { Cost = cost, Start = start, End = end, Index = position };
+ }
this.PositionOrphanInterval(intervalNew, intervalIn);
this.Count++;
@@ -297,12 +316,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
///
private void UpdateCost(int i, int position, float cost)
{
+ Span costs = this.Costs.GetSpan(); // span view over the Costs buffer
+ Span distArray = this.DistArray.GetSpan(); // span view over the DistArray buffer
int k = i - position;
- if (this.Costs[i] > cost)
+ if (costs[i] > cost) // only overwrite when the new cost is lower than the stored one
{
- this.Costs[i] = cost;
- this.DistArray[i] = (ushort)(k + 1);
+ costs[i] = cost;
+ distArray[i] = (ushort)(k + 1); // offset of i from position, plus one
}
}
+
+ /// <inheritdoc/>
+ public void Dispose() => this.Costs.Dispose(); // releases only Costs, which the constructor allocates; DistArray is supplied by the caller and is not disposed here
}
}
diff --git a/src/ImageSharp/Formats/Webp/Lossless/CostModel.cs b/src/ImageSharp/Formats/Webp/Lossless/CostModel.cs
index 7f4d0307bc..bdaf30dc9c 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/CostModel.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/CostModel.cs
@@ -87,7 +87,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
if (nonzeros <= 1)
{
- output.AsSpan(0, numSymbols).Fill(0);
+ output.AsSpan(0, numSymbols).Clear();
}
else
{
diff --git a/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs b/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs
index a038248f1a..6c2217eb6e 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HTreeGroup.cs
@@ -13,16 +13,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
/// - UsePackedTable: few enough literal symbols, so all the bit codes can fit into a small look-up table PackedTable[]
/// The common literal base, if applicable, is stored in 'LiteralArb'.
///
- internal class HTreeGroup
+ internal struct HTreeGroup
{
public HTreeGroup(uint packedTableSize)
{
this.HTrees = new List(WebpConstants.HuffmanCodesPerMetaCode);
this.PackedTable = new HuffmanCode[packedTableSize];
- for (int i = 0; i < packedTableSize; i++)
- {
- this.PackedTable[i] = new HuffmanCode();
- }
+ this.IsTrivialCode = false; // NOTE(review): the explicit false/0 assignments presumably exist because HTreeGroup is now a struct and its constructor must assign every member - confirm
+ this.IsTrivialLiteral = false;
+ this.LiteralArb = 0;
+ this.UsePackedTable = false;
}
///
diff --git a/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs
index 5d407d73c1..b52f8eb5d5 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HistogramEncoder.cs
@@ -287,7 +287,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
// Create a mapping from a cluster id to its minimal version.
int clusterMax = 0;
- clusterMappingsTmp.AsSpan().Fill(0);
+ clusterMappingsTmp.AsSpan().Clear();
// Re-map the ids.
for (int i = 0; i < symbols.Length; i++)
diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanCode.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanCode.cs
index f75c64de11..efb9283568 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanCode.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanCode.cs
@@ -9,7 +9,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
/// A classic way to do entropy coding where a smaller number of bits are used for more frequent codes.
///
[DebuggerDisplay("BitsUsed: {BitsUsed}, Value: {Value}")]
- internal class HuffmanCode
+ internal struct HuffmanCode
{
///
/// Gets or sets the number of bits used for this symbol.
diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs
index 0376311ed9..07fec7f990 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanTree.cs
@@ -9,7 +9,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
/// Represents the Huffman tree.
///
[DebuggerDisplay("TotalCount = {TotalCount}, Value = {Value}, Left = {PoolIndexLeft}, Right = {PoolIndexRight}")]
- internal struct HuffmanTree : IDeepCloneable
+ internal struct HuffmanTree
{
///
/// Initializes a new instance of the struct.
@@ -57,7 +57,5 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
return t1.Value < t2.Value ? -1 : 1;
}
-
- public IDeepCloneable DeepClone() => new HuffmanTree(this);
}
}
diff --git a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
index 3c81f1a22c..56f2ee9cef 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/HuffmanUtils.cs
@@ -2,6 +2,7 @@
// Licensed under the Apache License, Version 2.0.
using System;
+using System.Runtime.CompilerServices;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
@@ -28,7 +29,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
public static void CreateHuffmanTree(uint[] histogram, int treeDepthLimit, bool[] bufRle, HuffmanTree[] huffTree, HuffmanTreeCode huffCode)
{
int numSymbols = huffCode.NumSymbols;
- bufRle.AsSpan().Fill(false);
+ bufRle.AsSpan().Clear();
OptimizeHuffmanForRle(numSymbols, bufRle, histogram);
GenerateOptimalTree(huffTree, histogram, numSymbols, treeDepthLimit, huffCode.CodeLengths);
@@ -218,8 +219,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
while (treeSize > 1)
{
// Finish when we have only one root.
- treePool[treePoolSize++] = (HuffmanTree)tree[treeSize - 1].DeepClone();
- treePool[treePoolSize++] = (HuffmanTree)tree[treeSize - 2].DeepClone();
+ treePool[treePoolSize++] = tree[treeSize - 1];
+ treePool[treePoolSize++] = tree[treeSize - 2];
int count = treePool[treePoolSize - 1].TotalCount + treePool[treePoolSize - 2].TotalCount;
treeSize -= 2;
@@ -238,7 +239,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
int startIdx = endIdx + num - 1;
for (int i = startIdx; i >= endIdx; i--)
{
- tree[i] = (HuffmanTree)tree[i - 1].DeepClone();
+ tree[i] = tree[i - 1];
}
tree[k].TotalCount = count;
@@ -307,9 +308,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
public static int BuildHuffmanTable(Span table, int rootBits, int[] codeLengths, int codeLengthsSize)
{
- Guard.MustBeGreaterThan(rootBits, 0, nameof(rootBits));
- Guard.NotNull(codeLengths, nameof(codeLengths));
- Guard.MustBeGreaterThan(codeLengthsSize, 0, nameof(codeLengthsSize));
+ DebugGuard.MustBeGreaterThan(rootBits, 0, nameof(rootBits));
+ DebugGuard.NotNull(codeLengths, nameof(codeLengths));
+ DebugGuard.MustBeGreaterThan(codeLengthsSize, 0, nameof(codeLengthsSize));
// sorted[codeLengthsSize] is a pre-allocated array for sorting symbols by code length.
int[] sorted = new int[codeLengthsSize];
@@ -467,27 +468,27 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
break;
}
- else if (repetitions < 11)
+
+ if (repetitions < 11)
{
tokens[pos].Code = 17;
tokens[pos].ExtraBits = (byte)(repetitions - 3);
pos++;
break;
}
- else if (repetitions < 139)
+
+ if (repetitions < 139)
{
tokens[pos].Code = 18;
tokens[pos].ExtraBits = (byte)(repetitions - 11);
pos++;
break;
}
- else
- {
- tokens[pos].Code = 18;
- tokens[pos].ExtraBits = 0x7f; // 138 repeated 0s
- pos++;
- repetitions -= 138;
- }
+
+ tokens[pos].Code = 18;
+ tokens[pos].ExtraBits = 0x7f; // 138 repeated 0s
+ pos++;
+ repetitions -= 138;
}
return pos;
@@ -519,20 +520,19 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
break;
}
- else if (repetitions < 7)
+
+ if (repetitions < 7)
{
tokens[pos].Code = 16;
tokens[pos].ExtraBits = (byte)(repetitions - 3);
pos++;
break;
}
- else
- {
- tokens[pos].Code = 16;
- tokens[pos].ExtraBits = 3;
- pos++;
- repetitions -= 6;
- }
+
+ tokens[pos].Code = 16;
+ tokens[pos].ExtraBits = 3;
+ pos++;
+ repetitions -= 6;
}
return pos;
@@ -541,7 +541,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
///
/// Get the actual bit values for a tree of bit depths.
///
- /// The hiffman tree.
+ /// The huffman tree.
private static void ConvertBitDepthsToSymbols(HuffmanTreeCode tree)
{
// 0 bit-depth means that the symbol does not exist.
@@ -628,7 +628,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
///
private static void ReplicateValue(Span table, int step, int end, HuffmanCode code)
{
- Guard.IsTrue(end % step == 0, nameof(end), "end must be a multiple of step");
+ DebugGuard.IsTrue(end % step == 0, nameof(end), "end must be a multiple of step");
do
{
@@ -656,6 +656,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
///
/// Heuristics for selecting the stride ranges to collapse.
///
+ [MethodImpl(InliningOptions.ShortMethod)]
private static bool ValuesShouldBeCollapsedToStrideAverage(int a, int b) => Math.Abs(a - b) < 4;
}
}
diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index f9b97c6c44..471c083cda 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -42,12 +42,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
private static readonly Vector128 TransformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+ private static readonly Vector256 TransformColorAlphaGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
private static readonly Vector128 TransformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+ private static readonly Vector256 TransformColorRedBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+
private static readonly byte TransformColorShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
private static readonly Vector128 TransformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+ private static readonly Vector256 TransformColorInverseAlphaGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
private static readonly byte TransformColorInverseShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0);
#endif
@@ -122,76 +128,67 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
if (Avx2.IsSupported)
{
int numPixels = pixelData.Length;
- fixed (uint* p = pixelData)
+ nint i;
+ for (i = 0; i <= numPixels - 8; i += 8)
{
- int i;
- for (i = 0; i + 8 <= numPixels; i += 8)
- {
- uint* idx = p + i;
- Vector256 input = Avx.LoadVector256((ushort*)idx).AsByte();
- Vector256 in0g0g = Avx2.Shuffle(input, AddGreenToBlueAndRedMaskAvx2);
- Vector256 output = Avx2.Add(input, in0g0g);
- Avx.Store((byte*)idx, output);
- }
+ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
+ Vector256 input = Unsafe.As>(ref pos).AsByte();
+ Vector256 in0g0g = Avx2.Shuffle(input, AddGreenToBlueAndRedMaskAvx2);
+ Vector256 output = Avx2.Add(input, in0g0g);
+ Unsafe.As>(ref pos) = output.AsUInt32();
+ }
- if (i != numPixels)
- {
- AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i));
- }
+ if (i != numPixels)
+ {
+ AddGreenToBlueAndRedScalar(pixelData.Slice((int)i));
}
}
else if (Ssse3.IsSupported)
{
int numPixels = pixelData.Length;
- fixed (uint* p = pixelData)
+ nint i;
+ for (i = 0; i <= numPixels - 4; i += 4)
{
- int i;
- for (i = 0; i + 4 <= numPixels; i += 4)
- {
- uint* idx = p + i;
- Vector128 input = Sse2.LoadVector128((ushort*)idx).AsByte();
- Vector128 in0g0g = Ssse3.Shuffle(input, AddGreenToBlueAndRedMaskSsse3);
- Vector128 output = Sse2.Add(input, in0g0g);
- Sse2.Store((byte*)idx, output.AsByte());
- }
+ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
+ Vector128 input = Unsafe.As>(ref pos).AsByte();
+ Vector128 in0g0g = Ssse3.Shuffle(input, AddGreenToBlueAndRedMaskSsse3);
+ Vector128 output = Sse2.Add(input, in0g0g);
+ Unsafe.As>(ref pos) = output.AsUInt32();
+ }
- if (i != numPixels)
- {
- AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i));
- }
+ if (i != numPixels)
+ {
+ AddGreenToBlueAndRedScalar(pixelData.Slice((int)i));
}
}
else if (Sse2.IsSupported)
{
int numPixels = pixelData.Length;
- fixed (uint* p = pixelData)
+ nint i;
+ for (i = 0; i <= numPixels - 4; i += 4)
{
- int i;
- for (i = 0; i + 4 <= numPixels; i += 4)
- {
- uint* idx = p + i;
- Vector128 input = Sse2.LoadVector128((ushort*)idx);
- Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
- Vector128 b = Sse2.ShuffleLow(a, AddGreenToBlueAndRedShuffleMask);
- Vector128 c = Sse2.ShuffleHigh(b, AddGreenToBlueAndRedShuffleMask); // 0g0g
- Vector128 output = Sse2.Add(input.AsByte(), c.AsByte());
- Sse2.Store((byte*)idx, output);
- }
+ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
+ Vector128 input = Unsafe.As>(ref pos).AsByte();
+ Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
+ Vector128 b = Sse2.ShuffleLow(a, AddGreenToBlueAndRedShuffleMask);
+ Vector128 c = Sse2.ShuffleHigh(b, AddGreenToBlueAndRedShuffleMask); // 0g0g
+ Vector128 output = Sse2.Add(input.AsByte(), c.AsByte());
+ Unsafe.As>(ref pos) = output.AsUInt32();
+ }
- if (i != numPixels)
- {
- AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i));
- }
+ if (i != numPixels)
+ {
+ AddGreenToBlueAndRedScalar(pixelData.Slice((int)i));
}
}
else
#endif
{
- AddGreenToBlueAndRedNoneVectorized(pixelData);
+ AddGreenToBlueAndRedScalar(pixelData);
}
}
- private static void AddGreenToBlueAndRedNoneVectorized(Span pixelData)
+ private static void AddGreenToBlueAndRedScalar(Span pixelData)
{
int numPixels = pixelData.Length;
for (int i = 0; i < numPixels; i++)
@@ -211,76 +208,67 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
if (Avx2.IsSupported)
{
int numPixels = pixelData.Length;
- fixed (uint* p = pixelData)
+ nint i;
+ for (i = 0; i <= numPixels - 8; i += 8)
{
- int i;
- for (i = 0; i + 8 <= numPixels; i += 8)
- {
- uint* idx = p + i;
- Vector256 input = Avx.LoadVector256((ushort*)idx).AsByte();
- Vector256 in0g0g = Avx2.Shuffle(input, SubtractGreenFromBlueAndRedMaskAvx2);
- Vector256 output = Avx2.Subtract(input, in0g0g);
- Avx.Store((byte*)idx, output);
- }
+ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
+ Vector256 input = Unsafe.As>(ref pos).AsByte();
+ Vector256 in0g0g = Avx2.Shuffle(input, SubtractGreenFromBlueAndRedMaskAvx2);
+ Vector256 output = Avx2.Subtract(input, in0g0g);
+ Unsafe.As>(ref pos) = output.AsUInt32();
+ }
- if (i != numPixels)
- {
- SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i));
- }
+ if (i != numPixels)
+ {
+ SubtractGreenFromBlueAndRedScalar(pixelData.Slice((int)i));
}
}
else if (Ssse3.IsSupported)
{
int numPixels = pixelData.Length;
- fixed (uint* p = pixelData)
+ nint i;
+ for (i = 0; i <= numPixels - 4; i += 4)
{
- int i;
- for (i = 0; i + 4 <= numPixels; i += 4)
- {
- uint* idx = p + i;
- Vector128 input = Sse2.LoadVector128((ushort*)idx).AsByte();
- Vector128 in0g0g = Ssse3.Shuffle(input, SubtractGreenFromBlueAndRedMaskSsse3);
- Vector128 output = Sse2.Subtract(input, in0g0g);
- Sse2.Store((byte*)idx, output.AsByte());
- }
+ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
+ Vector128 input = Unsafe.As>(ref pos).AsByte();
+ Vector128 in0g0g = Ssse3.Shuffle(input, SubtractGreenFromBlueAndRedMaskSsse3);
+ Vector128 output = Sse2.Subtract(input, in0g0g);
+ Unsafe.As>(ref pos) = output.AsUInt32();
+ }
- if (i != numPixels)
- {
- SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i));
- }
+ if (i != numPixels)
+ {
+ SubtractGreenFromBlueAndRedScalar(pixelData.Slice((int)i));
}
}
else if (Sse2.IsSupported)
{
int numPixels = pixelData.Length;
- fixed (uint* p = pixelData)
+ nint i;
+ for (i = 0; i <= numPixels - 4; i += 4)
{
- int i;
- for (i = 0; i + 4 <= numPixels; i += 4)
- {
- uint* idx = p + i;
- Vector128 input = Sse2.LoadVector128((ushort*)idx);
- Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
- Vector128 b = Sse2.ShuffleLow(a, SubtractGreenFromBlueAndRedShuffleMask);
- Vector128 c = Sse2.ShuffleHigh(b, SubtractGreenFromBlueAndRedShuffleMask); // 0g0g
- Vector128 output = Sse2.Subtract(input.AsByte(), c.AsByte());
- Sse2.Store((byte*)idx, output);
- }
+ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
+ Vector128 input = Unsafe.As>(ref pos).AsByte();
+ Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
+ Vector128 b = Sse2.ShuffleLow(a, SubtractGreenFromBlueAndRedShuffleMask);
+ Vector128 c = Sse2.ShuffleHigh(b, SubtractGreenFromBlueAndRedShuffleMask); // 0g0g
+ Vector128 output = Sse2.Subtract(input.AsByte(), c.AsByte());
+ Unsafe.As>(ref pos) = output.AsUInt32();
+ }
- if (i != numPixels)
- {
- SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i));
- }
+ if (i != numPixels)
+ {
+ SubtractGreenFromBlueAndRedScalar(pixelData.Slice((int)i));
}
}
else
#endif
{
- SubtractGreenFromBlueAndRedNoneVectorized(pixelData);
+ SubtractGreenFromBlueAndRedScalar(pixelData);
}
}
- private static void SubtractGreenFromBlueAndRedNoneVectorized(Span<uint> pixelData)
+ private static void SubtractGreenFromBlueAndRedScalar(Span<uint> pixelData)
{
int numPixels = pixelData.Length;
for (int i = 0; i < numPixels; i++)
@@ -403,49 +391,74 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
/// Color transform keeps the green (G) value as it is, transforms red (R) based on green and transforms blue (B) based on green and then based on red.
/// </summary>
/// <param name="m">The Vp8LMultipliers.</param>
- /// <param name="data">The pixel data to transform.</param>
+ /// <param name="pixelData">The pixel data to transform.</param>
/// <param name="numPixels">The number of pixels to process.</param>
- public static void TransformColor(Vp8LMultipliers m, Span<uint> data, int numPixels)
+ public static void TransformColor(Vp8LMultipliers m, Span<uint> pixelData, int numPixels)
{
#if SUPPORTS_RUNTIME_INTRINSICS
- if (Sse2.IsSupported)
+ if (Avx2.IsSupported && numPixels >= 8)
+ {
+ Vector256<int> multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
+ Vector256<int> multsb2 = MkCst32(Cst5b(m.RedToBlue), 0);
+
+ nint idx;
+ for (idx = 0; idx <= numPixels - 8; idx += 8)
+ {
+ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
+ Vector256<uint> input = Unsafe.As<uint, Vector256<uint>>(ref pos);
+ Vector256<byte> a = Avx2.And(input.AsByte(), TransformColorAlphaGreenMask256);
+ Vector256<short> b = Avx2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask);
+ Vector256<short> c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask);
+ Vector256<short> d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
+ Vector256<short> e = Avx2.ShiftLeftLogical(input.AsInt16(), 8);
+ Vector256<short> f = Avx2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
+ Vector256<int> g = Avx2.ShiftRightLogical(f.AsInt32(), 16);
+ Vector256<byte> h = Avx2.Add(g.AsByte(), d.AsByte());
+ Vector256<byte> i = Avx2.And(h, TransformColorRedBlueMask256);
+ Vector256<byte> output = Avx2.Subtract(input.AsByte(), i);
+ Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
+ }
+
+ if (idx != numPixels)
+ {
+ TransformColorScalar(m, pixelData.Slice((int)idx), numPixels - (int)idx);
+ }
+ }
+ else if (Sse2.IsSupported)
{
Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
Vector128<int> multsb2 = MkCst16(Cst5b(m.RedToBlue), 0);
- fixed (uint* src = data)
+ nint idx;
+ for (idx = 0; idx <= numPixels - 4; idx += 4)
{
- int idx;
- for (idx = 0; idx + 4 <= numPixels; idx += 4)
- {
- uint* pos = src + idx;
- Vector128 input = Sse2.LoadVector128(pos);
- Vector128 a = Sse2.And(input.AsByte(), TransformColorAlphaGreenMask);
- Vector128 b = Sse2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask);
- Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask);
- Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
- Vector128 e = Sse2.ShiftLeftLogical(input.AsInt16(), 8);
- Vector128 f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
- Vector128 g = Sse2.ShiftRightLogical(f.AsInt32(), 16);
- Vector128 h = Sse2.Add(g.AsByte(), d.AsByte());
- Vector128 i = Sse2.And(h, TransformColorRedBlueMask);
- Vector128 output = Sse2.Subtract(input.AsByte(), i);
- Sse2.Store((byte*)pos, output);
- }
+ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
+ Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
+ Vector128<byte> a = Sse2.And(input.AsByte(), TransformColorAlphaGreenMask);
+ Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask);
+ Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask);
+ Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
+ Vector128<short> e = Sse2.ShiftLeftLogical(input.AsInt16(), 8);
+ Vector128<short> f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
+ Vector128<int> g = Sse2.ShiftRightLogical(f.AsInt32(), 16);
+ Vector128<byte> h = Sse2.Add(g.AsByte(), d.AsByte());
+ Vector128<byte> i = Sse2.And(h, TransformColorRedBlueMask);
+ Vector128<byte> output = Sse2.Subtract(input.AsByte(), i);
+ Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
+ }
- if (idx != numPixels)
- {
- TransformColorNoneVectorized(m, data.Slice(idx), numPixels - idx);
- }
+ if (idx != numPixels)
+ {
+ TransformColorScalar(m, pixelData.Slice((int)idx), numPixels - (int)idx);
}
}
else
#endif
{
- TransformColorNoneVectorized(m, data, numPixels);
+ TransformColorScalar(m, pixelData, numPixels);
}
}
- private static void TransformColorNoneVectorized(Vp8LMultipliers m, Span<uint> data, int numPixels)
+ private static void TransformColorScalar(Vp8LMultipliers m, Span<uint> data, int numPixels)
{
for (int i = 0; i < numPixels; i++)
{
@@ -471,45 +484,71 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
public static void TransformColorInverse(Vp8LMultipliers m, Span<uint> pixelData)
{
#if SUPPORTS_RUNTIME_INTRINSICS
- if (Sse2.IsSupported)
+ if (Avx2.IsSupported && pixelData.Length >= 8)
+ {
+ Vector256<int> multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
+ Vector256<int> multsb2 = MkCst32(Cst5b(m.RedToBlue), 0);
+ nint idx;
+ for (idx = 0; idx <= pixelData.Length - 8; idx += 8)
+ {
+ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
+ Vector256<uint> input = Unsafe.As<uint, Vector256<uint>>(ref pos);
+ Vector256<byte> a = Avx2.And(input.AsByte(), TransformColorInverseAlphaGreenMask256);
+ Vector256<short> b = Avx2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask);
+ Vector256<short> c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask);
+ Vector256<short> d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
+ Vector256<byte> e = Avx2.Add(input.AsByte(), d.AsByte());
+ Vector256<short> f = Avx2.ShiftLeftLogical(e.AsInt16(), 8);
+ Vector256<short> g = Avx2.MultiplyHigh(f, multsb2.AsInt16());
+ Vector256<int> h = Avx2.ShiftRightLogical(g.AsInt32(), 8);
+ Vector256<byte> i = Avx2.Add(h.AsByte(), f.AsByte());
+ Vector256<short> j = Avx2.ShiftRightLogical(i.AsInt16(), 8);
+ Vector256<byte> output = Avx2.Or(j.AsByte(), a);
+ Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
+ }
+
+ if (idx != pixelData.Length)
+ {
+ TransformColorInverseScalar(m, pixelData.Slice((int)idx));
+ }
+ }
+ else if (Sse2.IsSupported)
{
Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
Vector128<int> multsb2 = MkCst16(Cst5b(m.RedToBlue), 0);
- fixed (uint* src = pixelData)
+
+ nint idx;
+ for (idx = 0; idx <= pixelData.Length - 4; idx += 4)
{
- int idx;
- for (idx = 0; idx + 4 <= pixelData.Length; idx += 4)
- {
- uint* pos = src + idx;
- Vector128 input = Sse2.LoadVector128(pos);
- Vector128 a = Sse2.And(input.AsByte(), TransformColorInverseAlphaGreenMask);
- Vector128 b = Sse2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask);
- Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask);
- Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
- Vector128 e = Sse2.Add(input.AsByte(), d.AsByte());
- Vector128 f = Sse2.ShiftLeftLogical(e.AsInt16(), 8);
- Vector128 g = Sse2.MultiplyHigh(f, multsb2.AsInt16());
- Vector128 h = Sse2.ShiftRightLogical(g.AsInt32(), 8);
- Vector128 i = Sse2.Add(h.AsByte(), f.AsByte());
- Vector128 j = Sse2.ShiftRightLogical(i.AsInt16(), 8);
- Vector128 output = Sse2.Or(j.AsByte(), a);
- Sse2.Store((byte*)pos, output);
- }
+ ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
+ Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
+ Vector128<byte> a = Sse2.And(input.AsByte(), TransformColorInverseAlphaGreenMask);
+ Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask);
+ Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask);
+ Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
+ Vector128<byte> e = Sse2.Add(input.AsByte(), d.AsByte());
+ Vector128<short> f = Sse2.ShiftLeftLogical(e.AsInt16(), 8);
+ Vector128<short> g = Sse2.MultiplyHigh(f, multsb2.AsInt16());
+ Vector128<int> h = Sse2.ShiftRightLogical(g.AsInt32(), 8);
+ Vector128<byte> i = Sse2.Add(h.AsByte(), f.AsByte());
+ Vector128<short> j = Sse2.ShiftRightLogical(i.AsInt16(), 8);
+ Vector128<byte> output = Sse2.Or(j.AsByte(), a);
+ Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
+ }
- if (idx != pixelData.Length)
- {
- TransformColorInverseNoneVectorized(m, pixelData.Slice(idx));
- }
+ if (idx != pixelData.Length)
+ {
+ TransformColorInverseScalar(m, pixelData.Slice((int)idx));
}
}
else
#endif
{
- TransformColorInverseNoneVectorized(m, pixelData);
+ TransformColorInverseScalar(m, pixelData);
}
}
- private static void TransformColorInverseNoneVectorized(Vp8LMultipliers m, Span<uint> pixelData)
+ private static void TransformColorInverseScalar(Vp8LMultipliers m, Span<uint> pixelData)
{
for (int i = 0; i < pixelData.Length; i++)
{
@@ -744,6 +783,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
return (float)retVal;
}
+ [MethodImpl(InliningOptions.ShortMethod)]
public static byte TransformColorRed(sbyte greenToRed, uint argb)
{
sbyte green = U32ToS8(argb >> 8);
@@ -752,6 +792,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
return (byte)(newRed & 0xff);
}
+ [MethodImpl(InliningOptions.ShortMethod)]
public static byte TransformColorBlue(sbyte greenToBlue, sbyte redToBlue, uint argb)
{
sbyte green = U32ToS8(argb >> 8);
@@ -818,15 +859,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
int correction = (int)((23 * (origV & (y - 1))) >> 4);
return (vF * (WebpLookupTables.Log2Table[v] + logCnt)) + correction;
}
- else
- {
- return (float)(Log2Reciprocal * v * Math.Log(v));
- }
+
+ return (float)(Log2Reciprocal * v * Math.Log(v));
}
private static float FastLog2Slow(uint v)
{
Guard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v));
+
if (v < ApproxLogWithCorrectionMax)
{
int logCnt = 0;
@@ -1288,6 +1328,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
#if SUPPORTS_RUNTIME_INTRINSICS
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128<int> MkCst16(int hi, int lo) => Vector128.Create((hi << 16) | (lo & 0xffff));
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static Vector256<int> MkCst32(int hi, int lo) => Vector256.Create((hi << 16) | (lo & 0xffff));
#endif
private static uint Select(uint a, uint b, uint c, Span scratch)
diff --git a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs
index 6cd109121d..96cdc3cbc5 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopy.cs
@@ -6,7 +6,7 @@ using System.Diagnostics;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
[DebuggerDisplay("Mode: {Mode}, Len: {Len}, BgraOrDistance: {BgraOrDistance}")]
- internal class PixOrCopy
+ internal sealed class PixOrCopy
{
public PixOrCopyMode Mode { get; set; }
diff --git a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs
index 0d7023ffc2..26099b9023 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PixOrCopyMode.cs
@@ -3,7 +3,7 @@
namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
- internal enum PixOrCopyMode
+ internal enum PixOrCopyMode : byte
{
Literal,
diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
index 99504dd488..a1e04c66a5 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
@@ -5,11 +5,6 @@ using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
-#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-#endif
-
namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
///
@@ -34,22 +29,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
private const int PredLowEffort = 11;
-#if SUPPORTS_RUNTIME_INTRINSICS
- private static readonly Vector128 CollectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
-
- private static readonly Vector128 CollectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
-
- private static readonly Vector128 CollectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
-
- private static readonly Vector128 CollectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
-
- private static readonly Vector128 CollectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
-
- private static readonly Vector128 CollectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
-
- private static readonly Vector128 CollectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
-#endif
-
// This uses C#'s compiler optimization to refer to assembly's static data directly.
private static ReadOnlySpan<sbyte> DeltaLut => new sbyte[] { 16, 16, 8, 4, 2, 2, 2 };
@@ -572,19 +551,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
return (byte)lower;
}
- else
- {
- // upper is closer to residual than lower.
- if (residual <= boundaryResidual && upper > boundaryResidual)
- {
- // Halve quantization step to avoid crossing boundary. This midpoint is
- // on the same side of boundary as residual because midpoint <= residual
- // (since upper is closer than lower) and residual is below the boundary.
- return (byte)(lower + (quantization >> 1));
- }
- return (byte)(upper & 0xff);
+ // upper is closer to residual than lower.
+ if (residual <= boundaryResidual && upper > boundaryResidual)
+ {
+ // Halve quantization step to avoid crossing boundary. This midpoint is
+ // on the same side of boundary as residual because midpoint <= residual
+ // (since upper is closer than lower) and residual is below the boundary.
+ return (byte)(lower + (quantization >> 1));
}
+
+ return (byte)upper;
}
///
@@ -980,7 +957,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
Span histo = scratch.Slice(0, 256);
histo.Clear();
- CollectColorRedTransforms(argb, stride, tileWidth, tileHeight, greenToRed, histo);
+ ColorSpaceTransformUtils.CollectColorRedTransforms(argb, stride, tileWidth, tileHeight, greenToRed, histo);
double curDiff = PredictionCostCrossColor(accumulatedRedHisto, histo);
if ((byte)greenToRed == prevX.GreenToRed)
@@ -1018,7 +995,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
Span histo = scratch.Slice(0, 256);
histo.Clear();
- CollectColorBlueTransforms(argb, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
+ ColorSpaceTransformUtils.CollectColorBlueTransforms(argb, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
double curDiff = PredictionCostCrossColor(accumulatedBlueHisto, histo);
if ((byte)greenToBlue == prevX.GreenToBlue)
{
@@ -1057,146 +1034,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
return curDiff;
}
- private static void CollectColorRedTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (Sse41.IsSupported)
- {
- var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
- const int span = 8;
- Span values = stackalloc ushort[span];
- for (int y = 0; y < tileHeight; y++)
- {
- Span srcSpan = bgra.Slice(y * stride);
-#pragma warning disable SA1503 // Braces should not be omitted
- fixed (uint* src = srcSpan)
- fixed (ushort* dst = values)
- {
- for (int x = 0; x + span <= tileWidth; x += span)
- {
- uint* input0Idx = src + x;
- uint* input1Idx = src + x + (span / 2);
- Vector128 input0 = Sse2.LoadVector128((ushort*)input0Idx).AsByte();
- Vector128 input1 = Sse2.LoadVector128((ushort*)input1Idx).AsByte();
- Vector128 g0 = Sse2.And(input0, CollectColorRedTransformsGreenMask); // 0 0 | g 0
- Vector128 g1 = Sse2.And(input1, CollectColorRedTransformsGreenMask);
- Vector128 g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
- Vector128 a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0 | x r
- Vector128 a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16);
- Vector128 a = Sse41.PackUnsignedSaturate(a0, a1); // x r
- Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr
- Vector128 c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r'
- Vector128 d = Sse2.And(c, CollectColorRedTransformsAndMask); // 0 r'
- Sse2.Store(dst, d.AsUInt16());
- for (int i = 0; i < span; i++)
- {
- ++histo[values[i]];
- }
- }
- }
- }
-#pragma warning restore SA1503 // Braces should not be omitted
-
- int leftOver = tileWidth & (span - 1);
- if (leftOver > 0)
- {
- CollectColorRedTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToRed, histo);
- }
- }
- else
-#endif
- {
- CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
- }
- }
-
- private static void CollectColorRedTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span histo)
- {
- int pos = 0;
- while (tileHeight-- > 0)
- {
- for (int x = 0; x < tileWidth; x++)
- {
- int idx = LosslessUtils.TransformColorRed((sbyte)greenToRed, bgra[pos + x]);
- ++histo[idx];
- }
-
- pos += stride;
- }
- }
-
- private static void CollectColorBlueTransforms(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (Sse41.IsSupported)
- {
- const int span = 8;
- Span values = stackalloc ushort[span];
- var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
- var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
- for (int y = 0; y < tileHeight; y++)
- {
- Span srcSpan = bgra.Slice(y * stride);
-#pragma warning disable SA1503 // Braces should not be omitted
- fixed (uint* src = srcSpan)
- fixed (ushort* dst = values)
- {
- for (int x = 0; x + span <= tileWidth; x += span)
- {
- uint* input0Idx = src + x;
- uint* input1Idx = src + x + (span / 2);
- Vector128 input0 = Sse2.LoadVector128((ushort*)input0Idx).AsByte();
- Vector128 input1 = Sse2.LoadVector128((ushort*)input1Idx).AsByte();
- Vector128 r0 = Ssse3.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask);
- Vector128 r1 = Ssse3.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask);
- Vector128 r = Sse2.Or(r0, r1);
- Vector128 gb0 = Sse2.And(input0, CollectColorBlueTransformsGreenBlueMask);
- Vector128 gb1 = Sse2.And(input1, CollectColorBlueTransformsGreenBlueMask);
- Vector128 gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
- Vector128 g = Sse2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask);
- Vector128 a = Sse2.MultiplyHigh(r.AsInt16(), multsr);
- Vector128 b = Sse2.MultiplyHigh(g.AsInt16(), multsg);
- Vector128 c = Sse2.Subtract(gb.AsByte(), b.AsByte());
- Vector128 d = Sse2.Subtract(c, a.AsByte());
- Vector128 e = Sse2.And(d, CollectColorBlueTransformsBlueMask);
- Sse2.Store(dst, e.AsUInt16());
- for (int i = 0; i < span; i++)
- {
- ++histo[values[i]];
- }
- }
- }
- }
-#pragma warning restore SA1503 // Braces should not be omitted
-
- int leftOver = tileWidth & (span - 1);
- if (leftOver > 0)
- {
- CollectColorBlueTransformsNoneVectorized(bgra.Slice(tileWidth - leftOver), stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
- }
- }
- else
-#endif
- {
- CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
- }
- }
-
- private static void CollectColorBlueTransformsNoneVectorized(Span bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span histo)
- {
- int pos = 0;
- while (tileHeight-- > 0)
- {
- for (int x = 0; x < tileWidth; x++)
- {
- int idx = LosslessUtils.TransformColorBlue((sbyte)greenToBlue, (sbyte)redToBlue, bgra[pos + x]);
- ++histo[idx];
- }
-
- pos += stride;
- }
- }
-
private static float PredictionCostSpatialHistogram(int[][] accumulated, int[][] tile)
{
double retVal = 0.0d;
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs
index 502728b15f..fca4ec59f6 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LBackwardRefs.cs
@@ -7,7 +7,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
internal class Vp8LBackwardRefs
{
- public Vp8LBackwardRefs() => this.Refs = new List();
+ public Vp8LBackwardRefs(int pixels) => this.Refs = new List(pixels);
///
/// Gets or sets the common block-size.
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
index 4faa716495..ca2cfdc75e 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LEncoder.cs
@@ -124,19 +124,25 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
this.EncodedData = memoryAllocator.Allocate(pixelCount);
this.Palette = memoryAllocator.Allocate(WebpConstants.MaxPaletteSize);
this.Refs = new Vp8LBackwardRefs[3];
- this.HashChain = new Vp8LHashChain(pixelCount);
+ this.HashChain = new Vp8LHashChain(memoryAllocator, pixelCount);
// We round the block size up, so we're guaranteed to have at most MaxRefsBlockPerImage blocks used:
int refsBlockSize = ((pixelCount - 1) / MaxRefsBlockPerImage) + 1;
for (int i = 0; i < this.Refs.Length; i++)
{
- this.Refs[i] = new Vp8LBackwardRefs
+ this.Refs[i] = new Vp8LBackwardRefs(pixelCount)
{
BlockSize = refsBlockSize < MinBlockSize ? MinBlockSize : refsBlockSize
};
}
}
+ // RFC 1951 will calm you down if you are worried about this funny sequence.
+ // This sequence is tuned from that, but more weighted for lower symbol count,
+ // and more spiking histograms.
+ // This uses C#'s compiler optimization to refer to assembly's static data directly.
+ private static ReadOnlySpan<byte> StorageOrder => new byte[] { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
// This uses C#'s compiler optimization to refer to assembly's static data directly.
private static ReadOnlySpan<byte> Order => new byte[] { 1, 2, 0, 3 };
@@ -516,7 +522,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
// Calculate backward references from BGRA image.
- this.HashChain.Fill(this.memoryAllocator, bgra, this.quality, width, height, lowEffort);
+ this.HashChain.Fill(bgra, this.quality, width, height, lowEffort);
Vp8LBitWriter bitWriterBest = config.SubConfigs.Count > 1 ? this.bitWriter.Clone() : this.bitWriter;
Vp8LBitWriter bwInit = this.bitWriter;
@@ -530,6 +536,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
this.quality,
subConfig.Lz77,
ref cacheBits,
+ this.memoryAllocator,
this.HashChain,
this.Refs[0],
this.Refs[1]);
@@ -736,7 +743,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
// Calculate backward references from the image pixels.
- hashChain.Fill(this.memoryAllocator, bgra, quality, width, height, lowEffort);
+ hashChain.Fill(bgra, quality, width, height, lowEffort);
Vp8LBackwardRefs refs = BackwardReferenceEncoder.GetBackwardReferences(
width,
@@ -745,6 +752,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
quality,
(int)Vp8LLz77Type.Lz77Standard | (int)Vp8LLz77Type.Lz77Rle,
ref cacheBits,
+ this.memoryAllocator,
hashChain,
refsTmp1,
refsTmp2);
@@ -941,16 +949,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
private void StoreHuffmanTreeOfHuffmanTreeToBitMask(byte[] codeLengthBitDepth)
{
- // RFC 1951 will calm you down if you are worried about this funny sequence.
- // This sequence is tuned from that, but more weighted for lower symbol count,
- // and more spiking histograms.
- byte[] storageOrder = { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-
// Throw away trailing zeros:
int codesToStore = WebpConstants.CodeLengthCodes;
for (; codesToStore > 4; codesToStore--)
{
- if (codeLengthBitDepth[storageOrder[codesToStore - 1]] != 0)
+ if (codeLengthBitDepth[StorageOrder[codesToStore - 1]] != 0)
{
break;
}
@@ -959,7 +962,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
this.bitWriter.PutBits((uint)codesToStore - 4, 4);
for (int i = 0; i < codesToStore; i++)
{
- this.bitWriter.PutBits(codeLengthBitDepth[storageOrder[i]], 3);
+ this.bitWriter.PutBits(codeLengthBitDepth[StorageOrder[i]], 3);
}
}
@@ -1803,6 +1806,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
this.BgraScratch.Dispose();
this.Palette.Dispose();
this.TransformData.Dispose();
+ this.HashChain.Dispose();
}
}
}
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs
index 977a094bd1..1bc7613a90 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHashChain.cs
@@ -8,7 +8,7 @@ using SixLabors.ImageSharp.Memory;
namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
- internal class Vp8LHashChain
+ internal sealed class Vp8LHashChain : IDisposable
{
private const uint HashMultiplierHi = 0xc6a4a793u;
@@ -28,14 +28,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
///
private const int WindowSize = (1 << WindowSizeBits) - 120;
+ private readonly MemoryAllocator memoryAllocator;
+
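/// <summary>
/// Initializes a new instance of the <see cref="Vp8LHashChain"/> class.
/// </summary>
+ /// <param name="memoryAllocator">The memory allocator.</param>
/// <param name="size">The size off the chain.</param>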
- public Vp8LHashChain(int size)
+ public Vp8LHashChain(MemoryAllocator memoryAllocator, int size)
{
- this.OffsetLength = new uint[size];
- this.OffsetLength.AsSpan().Fill(0xcdcdcdcd);
+ this.memoryAllocator = memoryAllocator;
+ this.OffsetLength = this.memoryAllocator.Allocate<uint>(size, AllocationOptions.Clean);
this.Size = size;
}
@@ -45,16 +48,16 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
/// These 20 bits are the limit defined by GetWindowSizeForHashChain (through WindowSize = 1 << 20).
/// The lower 12 bits contain the length of the match.
///
- public uint[] OffsetLength { get; }
+ public IMemoryOwner<uint> OffsetLength { get; }
///
/// Gets the size of the hash chain.
- /// This is the maximum size of the hash_chain that can be constructed.
+ /// This is the maximum size of the hashchain that can be constructed.
/// Typically this is the pixel count (width x height) for a given image.
///
public int Size { get; }
- public void Fill(MemoryAllocator memoryAllocator, ReadOnlySpan<uint> bgra, int quality, int xSize, int ySize, bool lowEffort)
+ public void Fill(ReadOnlySpan<uint> bgra, int quality, int xSize, int ySize, bool lowEffort)
{
int size = xSize * ySize;
int iterMax = GetMaxItersForQuality(quality);
@@ -63,20 +66,21 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
if (size <= 2)
{
- this.OffsetLength[0] = 0;
+ this.OffsetLength.GetSpan()[0] = 0;
return;
}
- using IMemoryOwner<int> hashToFirstIndexBuffer = memoryAllocator.Allocate<int>(HashSize);
+ using IMemoryOwner<int> hashToFirstIndexBuffer = this.memoryAllocator.Allocate<int>(HashSize);
+ using IMemoryOwner<int> chainBuffer = this.memoryAllocator.Allocate<int>(size, AllocationOptions.Clean);
Span<int> hashToFirstIndex = hashToFirstIndexBuffer.GetSpan();
+ Span<int> chain = chainBuffer.GetSpan();
// Initialize hashToFirstIndex array to -1.
hashToFirstIndex.Fill(-1);
- int[] chain = new int[size];
-
// Fill the chain linking pixels with the same hash.
bool bgraComp = bgra.Length > 1 && bgra[0] == bgra[1];
+ Span<uint> tmp = stackalloc uint[2];
for (pos = 0; pos < size - 2;)
{
uint hashCode;
@@ -85,7 +89,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
// Consecutive pixels with the same color will share the same hash.
// We therefore use a different hash: the color and its repetition length.
- uint[] tmp = new uint[2];
+ tmp.Clear();
uint len = 1;
tmp[0] = bgra[pos];
@@ -134,7 +138,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
// Find the best match interval at each pixel, defined by an offset to the
// pixel and a length. The right-most pixel cannot match anything to the right
// (hence a best length of 0) and the left-most pixel nothing to the left (hence an offset of 0).
- this.OffsetLength[0] = this.OffsetLength[size - 1] = 0;
+ Span<uint> offsetLength = this.OffsetLength.GetSpan();
+ offsetLength[0] = offsetLength[size - 1] = 0;
for (int basePosition = size - 2; basePosition > 0;)
{
int maxLen = LosslessUtils.MaxFindCopyLength(size - 1 - basePosition);
@@ -208,7 +213,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
uint maxBasePosition = (uint)basePosition;
while (true)
{
- this.OffsetLength[basePosition] = (bestDistance << BackwardReferenceEncoder.MaxLengthBits) | (uint)bestLength;
+ offsetLength[basePosition] = (bestDistance << BackwardReferenceEncoder.MaxLengthBits) | (uint)bestLength;
--basePosition;
// Stop if we don't have a match or if we are out of bounds.
@@ -242,10 +247,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
[MethodImpl(InliningOptions.ShortMethod)]
- public int FindLength(int basePosition) => (int)(this.OffsetLength[basePosition] & ((1U << BackwardReferenceEncoder.MaxLengthBits) - 1));
+ public int FindLength(int basePosition) => (int)(this.OffsetLength.GetSpan()[basePosition] & ((1U << BackwardReferenceEncoder.MaxLengthBits) - 1));
[MethodImpl(InliningOptions.ShortMethod)]
- public int FindOffset(int basePosition) => (int)(this.OffsetLength[basePosition] >> BackwardReferenceEncoder.MaxLengthBits);
+ public int FindOffset(int basePosition) => (int)(this.OffsetLength.GetSpan()[basePosition] >> BackwardReferenceEncoder.MaxLengthBits);
///
/// Calculates the hash for a pixel pair.
@@ -280,5 +285,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
return maxWindowSize > WindowSize ? WindowSize : maxWindowSize;
}
+
+ ///
+ public void Dispose() => this.OffsetLength.Dispose();
}
}
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
index 8b02015687..bdb53f5c6a 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
@@ -320,7 +320,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
else
{
- output.Literal.AsSpan(0, literalSize).Fill(0);
+ output.Literal.AsSpan(0, literalSize).Clear();
}
}
@@ -343,7 +343,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
else
{
- output.Red.AsSpan(0, size).Fill(0);
+ output.Red.AsSpan(0, size).Clear();
}
}
@@ -366,7 +366,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
else
{
- output.Blue.AsSpan(0, size).Fill(0);
+ output.Blue.AsSpan(0, size).Clear();
}
}
@@ -389,7 +389,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
else
{
- output.Alpha.AsSpan(0, size).Fill(0);
+ output.Alpha.AsSpan(0, size).Clear();
}
}
@@ -412,7 +412,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
else
{
- output.Distance.AsSpan(0, size).Fill(0);
+ output.Distance.AsSpan(0, size).Clear();
}
}
diff --git a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
index cafccd0982..f517ad520f 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/WebpLosslessDecoder.cs
@@ -65,15 +65,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
FixedTableSize + 2704
};
- private static readonly byte[] CodeLengthCodeOrder = { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-
private static readonly int NumCodeLengthCodes = CodeLengthCodeOrder.Length;
- private static readonly byte[] LiteralMap =
- {
- 0, 1, 1, 1, 0
- };
-
///
/// Initializes a new instance of the class.
///
@@ -87,6 +80,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
this.configuration = configuration;
}
+ // This uses C#'s compiler optimization to refer to assembly's static data directly.
+ private static ReadOnlySpan<byte> CodeLengthCodeOrder => new byte[] { 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+ // This uses C#'s compiler optimization to refer to assembly's static data directly.
+ private static ReadOnlySpan<byte> LiteralMap => new byte[] { 0, 1, 1, 1, 0 };
+
///
/// Decodes the image from the stream using the bitreader.
///
@@ -834,10 +833,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
private void BuildPackedTable(HTreeGroup hTreeGroup)
{
- for (uint code = 0; code < HuffmanUtils.HuffmanPackedTableSize; ++code)
+ for (uint code = 0; code < HuffmanUtils.HuffmanPackedTableSize; code++)
{
uint bits = code;
- HuffmanCode huff = hTreeGroup.PackedTable[bits];
+ ref HuffmanCode huff = ref hTreeGroup.PackedTable[bits];
HuffmanCode hCode = hTreeGroup.HTrees[HuffIndex.Green][bits];
if (hCode.Value >= WebpConstants.NumLiteralCodes)
{
@@ -848,10 +847,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
huff.BitsUsed = 0;
huff.Value = 0;
- bits >>= AccumulateHCode(hCode, 8, huff);
- bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Red][bits], 16, huff);
- bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Blue][bits], 0, huff);
- bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Alpha][bits], 24, huff);
+ bits >>= AccumulateHCode(hCode, 8, ref huff);
+ bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Red][bits], 16, ref huff);
+ bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Blue][bits], 0, ref huff);
+ bits >>= AccumulateHCode(hTreeGroup.HTrees[HuffIndex.Alpha][bits], 24, ref huff);
}
}
}
@@ -992,7 +991,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
}
[MethodImpl(InliningOptions.ShortMethod)]
- private static int AccumulateHCode(HuffmanCode hCode, int shift, HuffmanCode huff)
+ private static int AccumulateHCode(HuffmanCode hCode, int shift, ref HuffmanCode huff)
{
huff.BitsUsed += hCode.BitsUsed;
huff.Value |= hCode.Value << shift;
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index a10ec6eabb..b8986f66ff 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -704,28 +704,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Transpose the two 4x4.
- Vector128 transpose00 = Sse2.UnpackLow(b0, b1);
- Vector128 transpose01 = Sse2.UnpackLow(b2, b3);
- Vector128 transpose02 = Sse2.UnpackHigh(b0, b1);
- Vector128 transpose03 = Sse2.UnpackHigh(b2, b3);
-
- // a00 a10 a01 a11 a02 a12 a03 a13
- // a20 a30 a21 a31 a22 a32 a23 a33
- // b00 b10 b01 b11 b02 b12 b03 b13
- // b20 b30 b21 b31 b22 b32 b23 b33
- Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
- Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
- Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
- Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
-
- // a00 a10 a20 a30 a01 a11 a21 a31
- // b00 b10 b20 b30 b01 b11 b21 b31
- // a02 a12 a22 a32 a03 a13 a23 a33
- // b02 b12 a22 b32 b03 b13 b23 b33
- Vector128 output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
- Vector128 output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
- Vector128 output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
- Vector128 output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
+ Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
@@ -769,6 +748,44 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
return Numerics.ReduceSum(result);
}
+
+ // Transpose two 4x4 16b matrices horizontally stored in registers.
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3)
+ {
+ // Transpose the two 4x4.
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
+ Vector128 transpose00 = Sse2.UnpackLow(b0, b1);
+ Vector128 transpose01 = Sse2.UnpackLow(b2, b3);
+ Vector128 transpose02 = Sse2.UnpackHigh(b0, b1);
+ Vector128 transpose03 = Sse2.UnpackHigh(b2, b3);
+
+ // a00 a10 a01 a11 a02 a12 a03 a13
+ // a20 a30 a21 a31 a22 a32 a23 a33
+ // b00 b10 b01 b11 b02 b12 b03 b13
+ // b20 b30 b21 b31 b22 b32 b23 b33
+ Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
+ Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
+ Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
+ Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
+
+ // a00 a10 a20 a30 a01 a11 a21 a31
+ // b00 b10 b20 b30 b01 b11 b21 b31
+ // a02 a12 a22 a32 a03 a13 a23 a33
+ // b02 b12 b22 b32 b03 b13 b23 b33
+ output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
+ output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
+ output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
+ output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
+
+ // a00 a10 a20 a30 b00 b10 b20 b30
+ // a01 a11 a21 a31 b01 b11 b21 b31
+ // a02 a12 a22 a32 b02 b12 b22 b32
+ // a03 a13 a23 a33 b03 b13 b23 b33
+ }
#endif
public static void TransformTwo(Span src, Span dst, Span scratch)
diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 38ed80590d..2fcea8ceea 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -329,7 +329,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
LossyUtils.TransformWht(dcTmp, tmp, scratch);
for (n = 0; n < 16; n += 2)
{
- Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch);
+ Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), scratch);
}
return nz;
@@ -342,7 +342,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Span scratch = it.Scratch3.AsSpan(0, 16);
Vp8Encoding.FTransform(src, reference, tmp, scratch);
int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
- Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
+ Vp8Encoding.ITransformOne(reference, tmp, yuvOut, scratch);
return nz;
}
@@ -375,7 +375,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
for (n = 0; n < 8; n += 2)
{
- Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch);
+ Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), scratch);
}
return nz << 16;
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
index 6279aef656..fcd61f2c0e 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
@@ -911,7 +911,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
this.LeftNz[8] = 0;
- this.LeftDerr.AsSpan().Fill(0);
+ this.LeftDerr.AsSpan().Clear();
}
private void InitTop()
@@ -919,14 +919,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
int topSize = this.mbw * 16;
this.YTop.AsSpan(0, topSize).Fill(127);
this.UvTop.AsSpan().Fill(127);
- this.Nz.AsSpan().Fill(0);
+ this.Nz.AsSpan().Clear();
int predsW = (4 * this.mbw) + 1;
int predsH = (4 * this.mbh) + 1;
int predsSize = predsW * predsH;
- this.Preds.AsSpan(predsSize + this.predsWidth, this.mbw).Fill(0);
+ this.Preds.AsSpan(predsSize + this.predsWidth, this.mbw).Clear();
- this.TopDerr.AsSpan().Fill(0);
+ this.TopDerr.AsSpan().Clear();
}
private int Bit(uint nz, int n) => (nz & (1 << n)) != 0 ? 1 : 0;
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
index 8a4115d216..37e09d0802 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoder.cs
@@ -546,7 +546,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
int predsW = (4 * this.Mbw) + 1;
int predsH = (4 * this.Mbh) + 1;
int predsSize = predsW * predsH;
- this.Preds.AsSpan(predsSize + this.PredsWidth - 4, 4).Fill(0);
+ this.Preds.AsSpan(predsSize + this.PredsWidth - 4, 4).Clear();
this.Nz[0] = 0; // constant
}
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index af7e8eaa36..aa4ab5767b 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -4,6 +4,11 @@
using System;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
@@ -60,6 +65,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 };
+#if SUPPORTS_RUNTIME_INTRINSICS
+ public static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16();
+
+ public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16();
+
+ public static readonly Vector128 Four = Vector128.Create((short)4);
+#endif
+
static Vp8Encoding()
{
for (int i = -255; i <= 255 + 255; i++)
@@ -68,51 +81,299 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
}
}
- public static void ITransform(Span reference, Span input, Span dst, bool doTwo, Span scratch)
+ // Transforms (Paragraph 14.4)
+ // Does two inverse transforms.
+ public static void ITransform(Span reference, Span input, Span dst, Span scratch)
{
- ITransformOne(reference, input, dst, scratch);
- if (doTwo)
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Sse2.IsSupported)
+ {
+ // This implementation makes use of 16-bit fixed point versions of two
+ // multiply constants:
+ // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+ // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+ //
+ // To be able to use signed 16-bit integers, we use the following trick to
+ // have constants within range:
+ // - Associated constants are obtained by subtracting the 16-bit fixed point
+ // version of one:
+ // k = K - (1 << 16) => K = k + (1 << 16)
+ // K1 = 85627 => k1 = 20091
+ // K2 = 35468 => k2 = -30068
+ // - The multiplication of a variable by a constant becomes the sum of the
+ // variable and the multiplication of that variable by the associated
+ // constant:
+ // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
+
+ // Load and concatenate the transform coefficients (we'll do two inverse
+ // transforms in parallel). In the case of only one inverse transform, the
+ // second half of the vectors will just contain random values we'll never
+ // use nor store.
+ ref short inputRef = ref MemoryMarshal.GetReference(input);
+ var in0 = Vector128.Create(Unsafe.As