
Fix missing braces

pull/1777/head
James Jackson-South 4 years ago
commit d2b205260e
  1. src/ImageSharp/Compression/Zlib/Adler32.cs (170 changed lines)
  2. src/ImageSharp/Compression/Zlib/Crc32.cs (180 changed lines)
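Both diffs apply the same cosmetic change: a `fixed` statement that was stacked directly on top of another (legal C#, since the second `fixed` is the embedded statement of the first) now gets its own explicit brace block. A minimal sketch of the before/after pattern, using a hypothetical helper rather than the ImageSharp methods below:

internal static class FixedBracesSketch
{
    private static readonly byte[] Tap = { 1, 2, 3, 4 };

    // Hypothetical helper used only to illustrate the brace style. Before the fix the two
    // pinned regions were stacked without braces:
    //
    //     fixed (byte* bufferPtr = buffer)
    //     fixed (byte* tapPtr = Tap)
    //     {
    //         ...
    //     }
    //
    // After the fix each `fixed` owns an explicit block, as below.
    public static unsafe int FirstByteSum(byte[] buffer)
    {
        if (buffer.Length == 0)
        {
            return 0;
        }

        fixed (byte* bufferPtr = buffer)
        {
            fixed (byte* tapPtr = Tap)
            {
                // Both pointers stay pinned for the duration of the nested blocks.
                return bufferPtr[0] + tapPtr[0];
            }
        }
    }
}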

src/ImageSharp/Compression/Zlib/Adler32.cs (170 changed lines)

@@ -91,115 +91,117 @@ namespace SixLabors.ImageSharp.Compression.Zlib

The change gives the second `fixed` statement its own brace block and re-indents the body, so the rendered diff repeats almost every line (once removed, once added). The code after the change:

    int index = 0;
    fixed (byte* bufferPtr = buffer)
    {
        fixed (byte* tapPtr = Tap1Tap2)
        {
            index += (int)blocks * BLOCK_SIZE;
            var localBufferPtr = bufferPtr;

            // _mm_setr_epi8 on x86
            Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
            Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
            Vector128<byte> zero = Vector128<byte>.Zero;
            var ones = Vector128.Create((short)1);

            while (blocks > 0)
            {
                uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
                if (n > blocks)
                {
                    n = blocks;
                }

                blocks -= n;

                // Process n blocks of data. At most NMAX data bytes can be
                // processed before s2 must be reduced modulo BASE.
                Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
                Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
                Vector128<uint> v_s1 = Vector128<uint>.Zero;

                do
                {
                    // Load 32 input bytes.
                    Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                    Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

                    // Add previous block byte sum to v_ps.
                    v_ps = Sse2.Add(v_ps, v_s1);

                    // Horizontally add the bytes for s1, multiply-adds the
                    // bytes by [ 32, 31, 30, ... ] for s2.
                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                    Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());

                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                    Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                    localBufferPtr += BLOCK_SIZE;
                }
                while (--n > 0);

                v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                const byte S2301 = 0b1011_0001; // A B C D -> B A D C
                const byte S1032 = 0b0100_1110; // A B C D -> C D A B

                v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

                s1 += v_s1.ToScalar();

                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

                s2 = v_s2.ToScalar();

                // Reduce.
                s1 %= BASE;
                s2 %= BASE;
            }

            if (length > 0)
            {
                if (length >= 16)
                {
                    s2 += s1 += localBufferPtr[0];
                    s2 += s1 += localBufferPtr[1];
                    s2 += s1 += localBufferPtr[2];
                    s2 += s1 += localBufferPtr[3];
                    s2 += s1 += localBufferPtr[4];
                    s2 += s1 += localBufferPtr[5];
                    s2 += s1 += localBufferPtr[6];
                    s2 += s1 += localBufferPtr[7];
                    s2 += s1 += localBufferPtr[8];
                    s2 += s1 += localBufferPtr[9];
                    s2 += s1 += localBufferPtr[10];
                    s2 += s1 += localBufferPtr[11];
                    s2 += s1 += localBufferPtr[12];
                    s2 += s1 += localBufferPtr[13];
                    s2 += s1 += localBufferPtr[14];
                    s2 += s1 += localBufferPtr[15];

                    localBufferPtr += 16;
                    length -= 16;
                }

                while (length-- > 0)
                {
                    s2 += s1 += *localBufferPtr++;
                }

                if (s1 >= BASE)
                {
                    s1 -= BASE;
                }

                s2 %= BASE;
            }

            return s1 | (s2 << 16);
        }
    }
}
#endif
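For reference, the SSE block above maintains the same two running sums (`s1`, `s2`) as the textbook scalar Adler-32, reducing modulo `BASE` at least every `NMAX` bytes so the 32-bit accumulators cannot overflow. A minimal scalar sketch, assuming the standard zlib constants BASE = 65521 and NMAX = 5552 (the diff uses the same names, but their values are defined elsewhere in the file):

using System;

internal static class Adler32Reference
{
    private const uint BASE = 65521; // Largest prime smaller than 65536 (standard zlib value).
    private const uint NMAX = 5552;  // Max bytes before s2 could overflow a uint without a modulo.

    // Plain scalar Adler-32, shown only as a reference for what the SSE path accumulates:
    // s1 is the running byte sum, s2 is the running sum of the s1 values.
    public static uint Calculate(uint adler, ReadOnlySpan<byte> buffer)
    {
        uint s1 = adler & 0xFFFF;
        uint s2 = (adler >> 16) & 0xFFFF;

        int index = 0;
        while (index < buffer.Length)
        {
            int n = Math.Min((int)NMAX, buffer.Length - index);
            for (int i = 0; i < n; i++)
            {
                s1 += buffer[index + i];
                s2 += s1;
            }

            index += n;
            s1 %= BASE;
            s2 %= BASE;
        }

        return s1 | (s2 << 16);
    }
}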

src/ImageSharp/Compression/Zlib/Crc32.cs (180 changed lines)

@@ -83,117 +83,119 @@ namespace SixLabors.ImageSharp.Compression.Zlib

Same fix as above: the stacked `fixed` statements get explicit braces and the body is re-indented, which is why the rendered diff touches nearly every line. The code after the change:

    int length = chunksize;

    fixed (byte* bufferPtr = buffer)
    {
        fixed (ulong* k05PolyPtr = K05Poly)
        {
            byte* localBufferPtr = bufferPtr;
            ulong* localK05PolyPtr = k05PolyPtr;

            // There's at least one block of 64.
            Vector128<ulong> x1 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x00));
            Vector128<ulong> x2 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x10));
            Vector128<ulong> x3 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x20));
            Vector128<ulong> x4 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x30));
            Vector128<ulong> x5;

            x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());

            // k1, k2
            Vector128<ulong> x0 = Sse2.LoadVector128(localK05PolyPtr + 0x0);

            localBufferPtr += 64;
            length -= 64;

            // Parallel fold blocks of 64, if any.
            while (length >= 64)
            {
                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
                Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
                Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);

                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
                x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
                x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);

                Vector128<ulong> y5 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x00));
                Vector128<ulong> y6 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x10));
                Vector128<ulong> y7 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x20));
                Vector128<ulong> y8 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x30));

                x1 = Sse2.Xor(x1, x5);
                x2 = Sse2.Xor(x2, x6);
                x3 = Sse2.Xor(x3, x7);
                x4 = Sse2.Xor(x4, x8);

                x1 = Sse2.Xor(x1, y5);
                x2 = Sse2.Xor(x2, y6);
                x3 = Sse2.Xor(x3, y7);
                x4 = Sse2.Xor(x4, y8);

                localBufferPtr += 64;
                length -= 64;
            }

            // Fold into 128-bits.
            // k3, k4
            x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);

            x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
            x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
            x1 = Sse2.Xor(x1, x2);
            x1 = Sse2.Xor(x1, x5);

            x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
            x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
            x1 = Sse2.Xor(x1, x3);
            x1 = Sse2.Xor(x1, x5);

            x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
            x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
            x1 = Sse2.Xor(x1, x4);
            x1 = Sse2.Xor(x1, x5);

            // Single fold blocks of 16, if any.
            while (length >= 16)
            {
                x2 = Sse2.LoadVector128((ulong*)localBufferPtr);

                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x1 = Sse2.Xor(x1, x2);
                x1 = Sse2.Xor(x1, x5);

                localBufferPtr += 16;
                length -= 16;
            }

            // Fold 128 - bits to 64 - bits.
            x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
            x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
            x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
            x1 = Sse2.Xor(x1, x2);

            // k5, k0
            x0 = Sse2.LoadScalarVector128(localK05PolyPtr + 0x4);

            x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
            x1 = Sse2.And(x1, x3);
            x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
            x1 = Sse2.Xor(x1, x2);

            // Barret reduce to 32-bits.
            // polynomial
            x0 = Sse2.LoadVector128(localK05PolyPtr + 0x6);

            x2 = Sse2.And(x1, x3);
            x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
            x2 = Sse2.And(x2, x3);
            x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
            x1 = Sse2.Xor(x1, x2);

            crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
            return buffer.Length - chunksize == 0 ? crc : CalculateScalar(crc, buffer.Slice(chunksize));
        }
    }
}
#endif
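For reference, the carry-less-multiply folding and Barrett reduction above target the standard zlib/PNG CRC-32 (reflected polynomial 0xEDB88320); any bytes past the last full chunk are handed to `CalculateScalar`. A minimal bit-at-a-time sketch of that checksum, with the conventional all-ones initial value and final inversion (this is the whole-message form and is not meant to mirror the internal seed handling of the methods in the diff):

using System;

internal static class Crc32Reference
{
    // Reflected form of the CRC-32 polynomial used by zlib/PNG.
    private const uint Polynomial = 0xEDB88320;

    // Bit-at-a-time scalar CRC-32 over a whole message. Shown only as a reference for the
    // value the vectorised folding above is expected to reach, far more slowly.
    public static uint Compute(ReadOnlySpan<byte> buffer)
    {
        uint crc = 0xFFFFFFFF;
        foreach (byte b in buffer)
        {
            crc ^= b;
            for (int i = 0; i < 8; i++)
            {
                crc = (crc & 1) != 0 ? (crc >> 1) ^ Polynomial : crc >> 1;
            }
        }

        return ~crc;
    }
}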
