Browse Source

Merge pull request #2024 from SixLabors/bp/adleravx

Add AVX2 version of adler32
pull/2034/head
Brian Popow 4 years ago
committed by GitHub
parent
commit
47a759e799
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 172
      src/ImageSharp/Compression/Zlib/Adler32.cs
  2. 39
      tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs

172
src/ImageSharp/Compression/Zlib/Adler32.cs

@ -3,6 +3,7 @@
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@ -31,6 +32,8 @@ namespace SixLabors.ImageSharp.Compression.Zlib
#if SUPPORTS_RUNTIME_INTRINSICS
private const int MinBufferSize = 64;
private const int BlockSize = 1 << 5;
// The C# compiler emits this as a compile-time constant embedded in the PE file.
private static ReadOnlySpan<byte> Tap1Tap2 => new byte[]
{
@ -63,6 +66,11 @@ namespace SixLabors.ImageSharp.Compression.Zlib
}
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported && buffer.Length >= MinBufferSize)
{
return CalculateAvx2(adler, buffer);
}
if (Ssse3.IsSupported && buffer.Length >= MinBufferSize)
{
return CalculateSse(adler, buffer);
@ -83,19 +91,15 @@ namespace SixLabors.ImageSharp.Compression.Zlib
uint s2 = (adler >> 16) & 0xFFFF;
// Process the data in blocks.
const int BLOCK_SIZE = 1 << 5;
uint length = (uint)buffer.Length;
uint blocks = length / BLOCK_SIZE;
length -= blocks * BLOCK_SIZE;
uint blocks = length / BlockSize;
length -= blocks * BlockSize;
int index = 0;
fixed (byte* bufferPtr = buffer)
fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
{
fixed (byte* tapPtr = Tap1Tap2)
fixed (byte* tapPtr = &MemoryMarshal.GetReference(Tap1Tap2))
{
index += (int)blocks * BLOCK_SIZE;
var localBufferPtr = bufferPtr;
byte* localBufferPtr = bufferPtr;
// _mm_setr_epi8 on x86
Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
@ -105,7 +109,7 @@ namespace SixLabors.ImageSharp.Compression.Zlib
while (blocks > 0)
{
uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
uint n = NMAX / BlockSize; /* The NMAX constraint. */
if (n > blocks)
{
n = blocks;
@ -138,7 +142,7 @@ namespace SixLabors.ImageSharp.Compression.Zlib
Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());
localBufferPtr += BLOCK_SIZE;
localBufferPtr += BlockSize;
}
while (--n > 0);
@ -164,45 +168,127 @@ namespace SixLabors.ImageSharp.Compression.Zlib
if (length > 0)
{
if (length >= 16)
{
s2 += s1 += localBufferPtr[0];
s2 += s1 += localBufferPtr[1];
s2 += s1 += localBufferPtr[2];
s2 += s1 += localBufferPtr[3];
s2 += s1 += localBufferPtr[4];
s2 += s1 += localBufferPtr[5];
s2 += s1 += localBufferPtr[6];
s2 += s1 += localBufferPtr[7];
s2 += s1 += localBufferPtr[8];
s2 += s1 += localBufferPtr[9];
s2 += s1 += localBufferPtr[10];
s2 += s1 += localBufferPtr[11];
s2 += s1 += localBufferPtr[12];
s2 += s1 += localBufferPtr[13];
s2 += s1 += localBufferPtr[14];
s2 += s1 += localBufferPtr[15];
localBufferPtr += 16;
length -= 16;
}
HandleLeftOver(localBufferPtr, length, ref s1, ref s2);
}
while (length-- > 0)
{
s2 += s1 += *localBufferPtr++;
}
return s1 | (s2 << 16);
}
}
}
if (s1 >= BASE)
{
s1 -= BASE;
}
// Based on: https://github.com/zlib-ng/zlib-ng/blob/develop/arch/x86/adler32_avx2.c
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
{
uint s1 = adler & 0xFFFF;
uint s2 = (adler >> 16) & 0xFFFF;
uint length = (uint)buffer.Length;
s2 %= BASE;
fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
{
byte* localBufferPtr = bufferPtr;
Vector256<byte> zero = Vector256<byte>.Zero;
var dot3v = Vector256.Create((short)1);
var dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
// Process n blocks of data. At most NMAX data bytes can be
// processed before s2 must be reduced modulo BASE.
var vs1 = Vector256.CreateScalar(s1);
var vs2 = Vector256.CreateScalar(s2);
while (length >= 32)
{
int k = length < NMAX ? (int)length : (int)NMAX;
k -= k % 32;
length -= (uint)k;
Vector256<uint> vs10 = vs1;
Vector256<uint> vs3 = Vector256<uint>.Zero;
while (k >= 32)
{
// Load 32 input bytes.
Vector256<byte> block = Avx.LoadVector256(localBufferPtr);
// Sum of abs diff, resulting in 2 x int32's
Vector256<ushort> vs1sad = Avx2.SumAbsoluteDifferences(block, zero);
vs1 = Avx2.Add(vs1, vs1sad.AsUInt32());
vs3 = Avx2.Add(vs3, vs10);
// sum 32 uint8s to 16 shorts.
Vector256<short> vshortsum2 = Avx2.MultiplyAddAdjacent(block, dot2v);
// sum 16 shorts to 8 uint32s.
Vector256<int> vsum2 = Avx2.MultiplyAddAdjacent(vshortsum2, dot3v);
vs2 = Avx2.Add(vsum2.AsUInt32(), vs2);
vs10 = vs1;
localBufferPtr += BlockSize;
k -= 32;
}
return s1 | (s2 << 16);
// Defer the multiplication with 32 to outside of the loop.
vs3 = Avx2.ShiftLeftLogical(vs3, 5);
vs2 = Avx2.Add(vs2, vs3);
s1 = (uint)Numerics.EvenReduceSum(vs1.AsInt32());
s2 = (uint)Numerics.ReduceSum(vs2.AsInt32());
s1 %= BASE;
s2 %= BASE;
vs1 = Vector256.CreateScalar(s1);
vs2 = Vector256.CreateScalar(s2);
}
if (length > 0)
{
HandleLeftOver(localBufferPtr, length, ref s1, ref s2);
}
return s1 | (s2 << 16);
}
}
private static unsafe void HandleLeftOver(byte* localBufferPtr, uint length, ref uint s1, ref uint s2)
{
if (length >= 16)
{
s2 += s1 += localBufferPtr[0];
s2 += s1 += localBufferPtr[1];
s2 += s1 += localBufferPtr[2];
s2 += s1 += localBufferPtr[3];
s2 += s1 += localBufferPtr[4];
s2 += s1 += localBufferPtr[5];
s2 += s1 += localBufferPtr[6];
s2 += s1 += localBufferPtr[7];
s2 += s1 += localBufferPtr[8];
s2 += s1 += localBufferPtr[9];
s2 += s1 += localBufferPtr[10];
s2 += s1 += localBufferPtr[11];
s2 += s1 += localBufferPtr[12];
s2 += s1 += localBufferPtr[13];
s2 += s1 += localBufferPtr[14];
s2 += s1 += localBufferPtr[15];
localBufferPtr += 16;
length -= 16;
}
while (length-- > 0)
{
s2 += s1 += *localBufferPtr++;
}
if (s1 >= BASE)
{
s1 -= BASE;
}
s2 %= BASE;
}
#endif

39
tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs

@ -3,6 +3,7 @@
using System;
using SixLabors.ImageSharp.Compression.Zlib;
using SixLabors.ImageSharp.Tests.TestUtilities;
using Xunit;
using SharpAdler32 = ICSharpCode.SharpZipLib.Checksum.Adler32;
@ -15,10 +16,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Png
[InlineData(0)]
[InlineData(1)]
[InlineData(2)]
public void ReturnsCorrectWhenEmpty(uint input)
{
Assert.Equal(input, Adler32.Calculate(input, default));
}
public void CalculateAdler_ReturnsCorrectWhenEmpty(uint input) => Assert.Equal(input, Adler32.Calculate(input, default));
[Theory]
[InlineData(0)]
@ -28,24 +26,49 @@ namespace SixLabors.ImageSharp.Tests.Formats.Png
[InlineData(1024 + 15)]
[InlineData(2034)]
[InlineData(4096)]
public void MatchesReference(int length)
public void CalculateAdler_MatchesReference(int length) => CalculateAdlerAndCompareToReference(length);
private static void CalculateAdlerAndCompareToReference(int length)
{
var data = GetBuffer(length);
// arrange
byte[] data = GetBuffer(length);
var adler = new SharpAdler32();
adler.Update(data);
long expected = adler.Value;
// act
long actual = Adler32.Calculate(data);
// assert
Assert.Equal(expected, actual);
}
private static byte[] GetBuffer(int length)
{
var data = new byte[length];
byte[] data = new byte[length];
new Random(1).NextBytes(data);
return data;
}
#if SUPPORTS_RUNTIME_INTRINSICS
[Fact]
public void RunCalculateAdlerTest_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.AllowAll);
[Fact]
public void RunCalculateAdlerTest_WithAvxDisabled_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
[Fact]
public void RunCalculateAdlerTest_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.DisableHWIntrinsic);
private static void RunCalculateAdlerTest()
{
int[] testData = { 0, 8, 215, 1024, 1024 + 15, 2034, 4096 };
for (int i = 0; i < testData.Length; i++)
{
CalculateAdlerAndCompareToReference(testData[i]);
}
}
#endif
}
}

Loading…
Cancel
Save