From bab85d4372ee7cc784acc7d743ffd2c6886ea460 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 21 Nov 2021 22:17:12 +0100 Subject: [PATCH] Add SSE version of CombinedShannonEntropy --- .../Formats/Webp/Lossless/LosslessUtils.cs | 154 ++++++++++++++++-- 1 file changed, 137 insertions(+), 17 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 471c083cd..52453c77f 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -759,28 +759,147 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless /// Shanon entropy. public static float CombinedShannonEntropy(Span x, Span y) { - double retVal = 0.0d; - uint sumX = 0, sumXY = 0; - for (int i = 0; i < 256; i++) - { - uint xi = (uint)x[i]; - if (xi != 0) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse41.IsSupported) + { + double retVal = 0.0d; + Span tmp = stackalloc int[4]; + ref int xRef = ref MemoryMarshal.GetReference(x); + ref int yRef = ref MemoryMarshal.GetReference(y); + Vector128 sumXY128 = Vector128.Zero; + Vector128 sumX128 = Vector128.Zero; + ref int tmpRef = ref MemoryMarshal.GetReference(tmp); + for (int i = 0; i < 256; i += 4) { - uint xy = xi + (uint)y[i]; - sumX += xi; - retVal -= FastSLog2(xi); - sumXY += xy; - retVal -= FastSLog2(xy); + Vector128 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i)); + Vector128 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i)); + + // Check if any X is non-zero: this actually provides a speedup as X is usually sparse. + if (Sse2.MoveMask(Sse2.CompareEqual(xVec, Vector128.Zero).AsByte()) != 0xFFFF) + { + Vector128 xy128 = Sse2.Add(xVec, yVec); + sumXY128 = Sse2.Add(sumXY128, xy128); + sumX128 = Sse2.Add(sumX128, xVec); + + // Analyze the different X + Y. + Unsafe.As>(ref tmpRef) = xy128; + if (tmp[0] != 0) + { + retVal -= FastSLog2((uint)tmp[0]); + if (x[i + 0] != 0) + { + retVal -= FastSLog2((uint)x[i + 0]); + } + } + + if (tmp[1] != 0) + { + retVal -= FastSLog2((uint)tmp[1]); + if (x[i + 1] != 0) + { + retVal -= FastSLog2((uint)x[i + 1]); + } + } + + if (tmp[2] != 0) + { + retVal -= FastSLog2((uint)tmp[2]); + if (x[i + 2] != 0) + { + retVal -= FastSLog2((uint)x[i + 2]); + } + } + + if (tmp[3] != 0) + { + retVal -= FastSLog2((uint)tmp[3]); + if (x[i + 3] != 0) + { + retVal -= FastSLog2((uint)x[i + 3]); + } + } + } + else + { + // X is fully 0, so only deal with Y. + sumXY128 = Sse2.Add(sumXY128, yVec); + + if (y[i] != 0) + { + retVal -= FastSLog2((uint)y[i]); + } + + if (y[i + 1] != 0) + { + retVal -= FastSLog2((uint)y[i + 1]); + } + + if (y[i + 2] != 0) + { + retVal -= FastSLog2((uint)y[i + 2]); + } + + if (y[i + 3] != 0) + { + retVal -= FastSLog2((uint)y[i + 3]); + } + } } - else if (y[i] != 0) + + // Sum up sumX_128 to get sumX and sum up sumXY_128 to get sumXY. + // note: not using here Numerics.ReduceSum, because grouping the same methods together should be slightly faster. + Vector128 haddSumX = Ssse3.HorizontalAdd(sumX128, sumX128); + Vector128 haddSumXY = Ssse3.HorizontalAdd(sumXY128, sumXY128); + Vector128 swappedSumX = Sse2.Shuffle(haddSumX, 0x1); + Vector128 swappedSumXY = Sse2.Shuffle(haddSumXY, 0x1); + Vector128 tmpSumX = Sse2.Add(haddSumX, swappedSumX); + Vector128 tmpSumXY = Sse2.Add(haddSumXY, swappedSumXY); + int sumX = Sse2.ConvertToInt32(tmpSumX); + int sumXY = Sse2.ConvertToInt32(tmpSumXY); + + retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY); + + return (float)retVal; + } + else +#endif + { + double retVal = 0.0d; + uint sumX = 0, sumXY = 0; + for (int i = 0; i < 256; i++) { - sumXY += (uint)y[i]; - retVal -= FastSLog2((uint)y[i]); + uint xi = (uint)x[i]; + if (xi != 0) + { + uint xy = xi + (uint)y[i]; + sumX += xi; + retVal -= FastSLog2(xi); + sumXY += xy; + retVal -= FastSLog2(xy); + } + else if (y[i] != 0) + { + sumXY += (uint)y[i]; + retVal -= FastSLog2((uint)y[i]); + } } + + retVal += FastSLog2(sumX) + FastSLog2(sumXY); + return (float)retVal; } + } - retVal += FastSLog2(sumX) + FastSLog2(sumXY); - return (float)retVal; + [MethodImpl(InliningOptions.ShortMethod)] + private static void AnalyzeXy(Span tmp, Span x, int i, int pos, ref double retVal) + { + if (tmp[pos] != 0) + { + retVal -= FastSLog2((uint)tmp[pos]); + if (x[i + pos] != 0) + { + retVal -= FastSLog2((uint)x[i + pos]); + } + } } [MethodImpl(InliningOptions.ShortMethod)] @@ -836,6 +955,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static float FastSLog2Slow(uint v) { DebugGuard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); + if (v < ApproxLogWithCorrectionMax) { int logCnt = 0; @@ -865,7 +985,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless private static float FastLog2Slow(uint v) { - Guard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); + DebugGuard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); if (v < ApproxLogWithCorrectionMax) {