|
|
|
@ -3,12 +3,12 @@ |
|
|
|
|
|
|
|
using System; |
|
|
|
using System.Runtime.CompilerServices; |
|
|
|
using System.Runtime.InteropServices; |
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
using System.Runtime.Intrinsics; |
|
|
|
using System.Runtime.Intrinsics.X86; |
|
|
|
#endif
|
|
|
|
|
|
|
|
#pragma warning disable IDE0007 // Use implicit type
|
|
|
|
namespace SixLabors.ImageSharp.Formats.Png.Zlib |
|
|
|
{ |
|
|
|
/// <summary>
|
|
|
|
@ -22,16 +22,22 @@ namespace SixLabors.ImageSharp.Formats.Png.Zlib |
|
|
|
/// </summary>
|
|
|
|
public const uint SeedValue = 1U; |
|
|
|
|
|
|
|
// Largest prime smaller than 65536
private const uint BASE = 65521;

// NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
private const uint NMAX = 5552;

#if SUPPORTS_RUNTIME_INTRINSICS
// NOTE(review): presumably the smallest input length that is routed to the
// intrinsics path; its use is outside this section, so confirm at the call site.
private const int MinBufferSize = 64;

// Multipliers applied to the bytes of one 32-byte block when accumulating s2.
// The vectorized path loads the first 16 entries as tap1 and the following
// 16 entries as tap2.
private static ReadOnlySpan<byte> Tap1Tap2 => new byte[]
{
    32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // tap1
    16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 // tap2
};
#endif
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Calculates the Adler32 checksum with the bytes taken from the span.
|
|
|
|
/// </summary>
|
|
|
|
@ -83,14 +89,15 @@ namespace SixLabors.ImageSharp.Formats.Png.Zlib |
|
|
|
length -= blocks * BLOCK_SIZE; |
|
|
|
|
|
|
|
int index = 0; |
|
|
|
fixed (byte* bufferPtr = &buffer[0]) |
|
|
|
fixed (byte* bufferPtr = buffer) |
|
|
|
fixed (byte* tapPtr = Tap1Tap2) |
|
|
|
{ |
|
|
|
index += (int)blocks * BLOCK_SIZE; |
|
|
|
var localBufferPtr = bufferPtr; |
|
|
|
|
|
|
|
// _mm_setr_epi8 on x86
|
|
|
|
var tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); |
|
|
|
var tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); |
|
|
|
Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr); |
|
|
|
Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10)); |
|
|
|
Vector128<byte> zero = Vector128<byte>.Zero; |
|
|
|
var ones = Vector128.Create((short)1); |
|
|
|
|
|
|
|
@ -106,28 +113,28 @@ namespace SixLabors.ImageSharp.Formats.Png.Zlib |
|
|
|
|
|
|
|
// Process n blocks of data. At most NMAX data bytes can be
|
|
|
|
// processed before s2 must be reduced modulo BASE.
|
|
|
|
Vector128<int> v_ps = Vector128.CreateScalar(s1 * n).AsInt32(); |
|
|
|
Vector128<int> v_s2 = Vector128.CreateScalar(s2).AsInt32(); |
|
|
|
Vector128<int> v_s1 = Vector128<int>.Zero; |
|
|
|
Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n); |
|
|
|
Vector128<uint> v_s2 = Vector128.CreateScalar(s2); |
|
|
|
Vector128<uint> v_s1 = Vector128<uint>.Zero; |
|
|
|
|
|
|
|
do |
|
|
|
{ |
|
|
|
// Load 32 input bytes.
|
|
|
|
Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr); |
|
|
|
Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 16); |
|
|
|
Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10); |
|
|
|
|
|
|
|
// Add previous block byte sum to v_ps.
|
|
|
|
v_ps = Sse2.Add(v_ps, v_s1); |
|
|
|
|
|
|
|
// Horizontally add the bytes for s1, multiply-adds the
|
|
|
|
// bytes by [ 32, 31, 30, ... ] for s2.
|
|
|
|
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsInt32()); |
|
|
|
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32()); |
|
|
|
Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1); |
|
|
|
v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones)); |
|
|
|
v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32()); |
|
|
|
|
|
|
|
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsInt32()); |
|
|
|
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32()); |
|
|
|
Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2); |
|
|
|
v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones)); |
|
|
|
v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32()); |
|
|
|
|
|
|
|
localBufferPtr += BLOCK_SIZE; |
|
|
|
} |
|
|
|
@ -139,148 +146,114 @@ namespace SixLabors.ImageSharp.Formats.Png.Zlib |
|
|
|
const byte S2301 = 0b1011_0001; // A B C D -> B A D C
|
|
|
|
const byte S1032 = 0b0100_1110; // A B C D -> C D A B
|
|
|
|
|
|
|
|
v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301)); |
|
|
|
v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032)); |
|
|
|
|
|
|
|
s1 += (uint)v_s1.ToScalar(); |
|
|
|
s1 += v_s1.ToScalar(); |
|
|
|
|
|
|
|
v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301)); |
|
|
|
v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032)); |
|
|
|
|
|
|
|
s2 = (uint)v_s2.ToScalar(); |
|
|
|
s2 = v_s2.ToScalar(); |
|
|
|
|
|
|
|
// Reduce.
|
|
|
|
s1 %= BASE; |
|
|
|
s2 %= BASE; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
ref byte bufferRef = ref MemoryMarshal.GetReference(buffer); |
|
|
|
|
|
|
|
if (length > 0) |
|
|
|
{ |
|
|
|
if (length >= 16) |
|
|
|
if (length > 0) |
|
|
|
{ |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
s2 += s1; |
|
|
|
length -= 16; |
|
|
|
} |
|
|
|
if (length >= 16) |
|
|
|
{ |
|
|
|
s2 += s1 += localBufferPtr[0]; |
|
|
|
s2 += s1 += localBufferPtr[1]; |
|
|
|
s2 += s1 += localBufferPtr[2]; |
|
|
|
s2 += s1 += localBufferPtr[3]; |
|
|
|
s2 += s1 += localBufferPtr[4]; |
|
|
|
s2 += s1 += localBufferPtr[5]; |
|
|
|
s2 += s1 += localBufferPtr[6]; |
|
|
|
s2 += s1 += localBufferPtr[7]; |
|
|
|
s2 += s1 += localBufferPtr[8]; |
|
|
|
s2 += s1 += localBufferPtr[9]; |
|
|
|
s2 += s1 += localBufferPtr[10]; |
|
|
|
s2 += s1 += localBufferPtr[11]; |
|
|
|
s2 += s1 += localBufferPtr[12]; |
|
|
|
s2 += s1 += localBufferPtr[13]; |
|
|
|
s2 += s1 += localBufferPtr[14]; |
|
|
|
s2 += s1 += localBufferPtr[15]; |
|
|
|
|
|
|
|
localBufferPtr += 16; |
|
|
|
length -= 16; |
|
|
|
} |
|
|
|
|
|
|
|
while (length-- > 0) |
|
|
|
{ |
|
|
|
s2 += s1 += Unsafe.Add(ref bufferRef, index++); |
|
|
|
} |
|
|
|
while (length-- > 0) |
|
|
|
{ |
|
|
|
s2 += s1 += *localBufferPtr++; |
|
|
|
} |
|
|
|
|
|
|
|
if (s1 >= BASE) |
|
|
|
{ |
|
|
|
s1 -= BASE; |
|
|
|
if (s1 >= BASE) |
|
|
|
{ |
|
|
|
s1 -= BASE; |
|
|
|
} |
|
|
|
|
|
|
|
s2 %= BASE; |
|
|
|
} |
|
|
|
|
|
|
|
s2 %= BASE; |
|
|
|
return s1 | (s2 << 16); |
|
|
|
} |
|
|
|
|
|
|
|
return s1 | (s2 << 16); |
|
|
|
} |
|
|
|
#endif
|
|
|
|
|
|
|
|
/// <summary>
/// Calculates the Adler32 checksum of the bytes in the span using scalar arithmetic,
/// continuing from an existing checksum value.
/// </summary>
/// <param name="adler">
/// The checksum to continue from. Its low 16 bits are used as s1 and its high 16 bits as s2.
/// </param>
/// <param name="buffer">The bytes to fold into the checksum.</param>
/// <returns>The updated checksum, packed as <c>(s2 &lt;&lt; 16) | s1</c>.</returns>
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer)
{
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    fixed (byte* bufferPtr = buffer)
    {
        byte* localBufferPtr = bufferPtr;
        uint length = (uint)buffer.Length;

        while (length > 0)
        {
            // Work in chunks of at most NMAX bytes so that the modulo reduction
            // only has to run once per chunk instead of once per byte.
            uint k = length < NMAX ? length : NMAX;
            length -= k;

            // Unrolled: 16 bytes per iteration.
            while (k >= 16)
            {
                s2 += s1 += localBufferPtr[0];
                s2 += s1 += localBufferPtr[1];
                s2 += s1 += localBufferPtr[2];
                s2 += s1 += localBufferPtr[3];
                s2 += s1 += localBufferPtr[4];
                s2 += s1 += localBufferPtr[5];
                s2 += s1 += localBufferPtr[6];
                s2 += s1 += localBufferPtr[7];
                s2 += s1 += localBufferPtr[8];
                s2 += s1 += localBufferPtr[9];
                s2 += s1 += localBufferPtr[10];
                s2 += s1 += localBufferPtr[11];
                s2 += s1 += localBufferPtr[12];
                s2 += s1 += localBufferPtr[13];
                s2 += s1 += localBufferPtr[14];
                s2 += s1 += localBufferPtr[15];

                localBufferPtr += 16;
                k -= 16;
            }

            // Remaining 0..15 bytes of the chunk. Counting down with an explicit
            // comparison keeps the unsigned counter from wrapping below zero.
            for (; k != 0; k--)
            {
                s2 += s1 += *localBufferPtr++;
            }

            s1 %= BASE;
            s2 %= BASE;
        }

        return (s2 << 16) | s1;
    }
}
|
|
|
} |
|
|
|
} |
|
|
|
|