diff --git a/src/ImageSharp/Compression/Zlib/Adler32.cs b/src/ImageSharp/Compression/Zlib/Adler32.cs index 7eb3f4516f..1f3b7e2a23 100644 --- a/src/ImageSharp/Compression/Zlib/Adler32.cs +++ b/src/ImageSharp/Compression/Zlib/Adler32.cs @@ -31,6 +31,8 @@ namespace SixLabors.ImageSharp.Compression.Zlib #if SUPPORTS_RUNTIME_INTRINSICS private const int MinBufferSize = 64; + private const int BLOCK_SIZE = 1 << 5; + // The C# compiler emits this as a compile-time constant embedded in the PE file. private static ReadOnlySpan Tap1Tap2 => new byte[] { @@ -63,6 +65,11 @@ namespace SixLabors.ImageSharp.Compression.Zlib } #if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported && buffer.Length >= MinBufferSize) + { + return CalculateAvx2(adler, buffer); + } + if (Ssse3.IsSupported && buffer.Length >= MinBufferSize) { return CalculateSse(adler, buffer); @@ -83,8 +90,6 @@ namespace SixLabors.ImageSharp.Compression.Zlib uint s2 = (adler >> 16) & 0xFFFF; // Process the data in blocks. - const int BLOCK_SIZE = 1 << 5; - uint length = (uint)buffer.Length; uint blocks = length / BLOCK_SIZE; length -= blocks * BLOCK_SIZE; @@ -164,45 +169,127 @@ namespace SixLabors.ImageSharp.Compression.Zlib if (length > 0) { - if (length >= 16) - { - s2 += s1 += localBufferPtr[0]; - s2 += s1 += localBufferPtr[1]; - s2 += s1 += localBufferPtr[2]; - s2 += s1 += localBufferPtr[3]; - s2 += s1 += localBufferPtr[4]; - s2 += s1 += localBufferPtr[5]; - s2 += s1 += localBufferPtr[6]; - s2 += s1 += localBufferPtr[7]; - s2 += s1 += localBufferPtr[8]; - s2 += s1 += localBufferPtr[9]; - s2 += s1 += localBufferPtr[10]; - s2 += s1 += localBufferPtr[11]; - s2 += s1 += localBufferPtr[12]; - s2 += s1 += localBufferPtr[13]; - s2 += s1 += localBufferPtr[14]; - s2 += s1 += localBufferPtr[15]; - - localBufferPtr += 16; - length -= 16; - } + HandleLeftOver(localBufferPtr, length, ref s1, ref s2); + } - while (length-- > 0) - { - s2 += s1 += *localBufferPtr++; - } + return s1 | (s2 << 16); + } + } + } - if (s1 >= BASE) - { - s1 -= BASE; - } + // Based on: https://github.com/zlib-ng/zlib-ng/blob/develop/arch/x86/adler32_avx2.c + [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] + public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan buffer) + { + uint s1 = adler & 0xFFFF; + uint s2 = (adler >> 16) & 0xFFFF; + uint length = (uint)buffer.Length; - s2 %= BASE; + fixed (byte* bufferPtr = buffer) + { + byte* localBufferPtr = bufferPtr; + + Vector256 zero = Vector256.Zero; + var dot3v = Vector256.Create((short)1); + var dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + + // Process n blocks of data. At most NMAX data bytes can be + // processed before s2 must be reduced modulo BASE. + var vs1 = Vector256.CreateScalar(s1); + var vs2 = Vector256.CreateScalar(s2); + + while (length >= 32) + { + int k = length < NMAX ? (int)length : (int)NMAX; + k -= k % 32; + length -= (uint)k; + + Vector256 vs10 = vs1; + Vector256 vs3 = Vector256.Zero; + + while (k >= 32) + { + // Load 32 input bytes. + Vector256 block = Avx.LoadVector256(localBufferPtr); + + // Sum of abs diff, resulting in 2 x int32's + Vector256 vs1sad = Avx2.SumAbsoluteDifferences(block, zero); + + vs1 = Avx2.Add(vs1, vs1sad.AsUInt32()); + vs3 = Avx2.Add(vs3, vs10); + + // sum 32 uint8s to 16 shorts. + Vector256 vshortsum2 = Avx2.MultiplyAddAdjacent(block, dot2v); + + // sum 16 shorts to 8 uint32s. + Vector256 vsum2 = Avx2.MultiplyAddAdjacent(vshortsum2, dot3v); + + vs2 = Avx2.Add(vsum2.AsUInt32(), vs2); + vs10 = vs1; + + localBufferPtr += BLOCK_SIZE; + k -= 32; } - return s1 | (s2 << 16); + // Defer the multiplication with 32 to outside of the loop. + vs3 = Avx2.ShiftLeftLogical(vs3, 5); + vs2 = Avx2.Add(vs2, vs3); + + s1 = (uint)Numerics.EvenReduceSum(vs1.AsInt32()); + s2 = (uint)Numerics.ReduceSum(vs2.AsInt32()); + + s1 %= BASE; + s2 %= BASE; + + vs1 = Vector256.CreateScalar(s1); + vs2 = Vector256.CreateScalar(s2); } + + if (length > 0) + { + HandleLeftOver(localBufferPtr, length, ref s1, ref s2); + } + + return s1 | (s2 << 16); + } + } + + private static unsafe void HandleLeftOver(byte* localBufferPtr, uint length, ref uint s1, ref uint s2) + { + if (length >= 16) + { + s2 += s1 += localBufferPtr[0]; + s2 += s1 += localBufferPtr[1]; + s2 += s1 += localBufferPtr[2]; + s2 += s1 += localBufferPtr[3]; + s2 += s1 += localBufferPtr[4]; + s2 += s1 += localBufferPtr[5]; + s2 += s1 += localBufferPtr[6]; + s2 += s1 += localBufferPtr[7]; + s2 += s1 += localBufferPtr[8]; + s2 += s1 += localBufferPtr[9]; + s2 += s1 += localBufferPtr[10]; + s2 += s1 += localBufferPtr[11]; + s2 += s1 += localBufferPtr[12]; + s2 += s1 += localBufferPtr[13]; + s2 += s1 += localBufferPtr[14]; + s2 += s1 += localBufferPtr[15]; + + localBufferPtr += 16; + length -= 16; } + + while (length-- > 0) + { + s2 += s1 += *localBufferPtr++; + } + + if (s1 >= BASE) + { + s1 -= BASE; + } + + s2 %= BASE; } #endif