From d2b205260ef275eeea8d2bf13f1db00fce0d2c80 Mon Sep 17 00:00:00 2001
From: James Jackson-South
Date: Tue, 5 Oct 2021 23:34:06 +1100
Subject: [PATCH] Fix missing braces

---
 src/ImageSharp/Compression/Zlib/Adler32.cs | 170 +++++++++----------
 src/ImageSharp/Compression/Zlib/Crc32.cs   | 180 +++++++++++----------
 2 files changed, 177 insertions(+), 173 deletions(-)

diff --git a/src/ImageSharp/Compression/Zlib/Adler32.cs b/src/ImageSharp/Compression/Zlib/Adler32.cs
index 9b3abd298..7eb3f4516 100644
--- a/src/ImageSharp/Compression/Zlib/Adler32.cs
+++ b/src/ImageSharp/Compression/Zlib/Adler32.cs
@@ -91,115 +91,117 @@ namespace SixLabors.ImageSharp.Compression.Zlib
             int index = 0;
             fixed (byte* bufferPtr = buffer)
-            fixed (byte* tapPtr = Tap1Tap2)
             {
+                fixed (byte* tapPtr = Tap1Tap2)
+                {
-                index += (int)blocks * BLOCK_SIZE;
-                var localBufferPtr = bufferPtr;
+                    index += (int)blocks * BLOCK_SIZE;
+                    var localBufferPtr = bufferPtr;
 
-                // _mm_setr_epi8 on x86
-                Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
-                Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
-                Vector128<byte> zero = Vector128<byte>.Zero;
-                var ones = Vector128.Create((short)1);
+                    // _mm_setr_epi8 on x86
+                    Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
+                    Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
+                    Vector128<byte> zero = Vector128<byte>.Zero;
+                    var ones = Vector128.Create((short)1);
 
-                while (blocks > 0)
-                {
+                    while (blocks > 0)
+                    {
-                    uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
-                    if (n > blocks)
-                    {
-                        n = blocks;
-                    }
+                        uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
+                        if (n > blocks)
+                        {
+                            n = blocks;
+                        }
 
-                    blocks -= n;
+                        blocks -= n;
 
-                    // Process n blocks of data. At most NMAX data bytes can be
-                    // processed before s2 must be reduced modulo BASE.
-                    Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
-                    Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
-                    Vector128<uint> v_s1 = Vector128<uint>.Zero;
+                        // Process n blocks of data. At most NMAX data bytes can be
+                        // processed before s2 must be reduced modulo BASE.
+                        Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
+                        Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
+                        Vector128<uint> v_s1 = Vector128<uint>.Zero;
 
-                    do
-                    {
+                        do
+                        {
-                        // Load 32 input bytes.
-                        Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
-                        Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);
+                            // Load 32 input bytes.
+                            Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
+                            Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);
 
-                        // Add previous block byte sum to v_ps.
-                        v_ps = Sse2.Add(v_ps, v_s1);
+                            // Add previous block byte sum to v_ps.
+                            v_ps = Sse2.Add(v_ps, v_s1);
 
-                        // Horizontally add the bytes for s1, multiply-adds the
-                        // bytes by [ 32, 31, 30, ... ] for s2.
-                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
-                        Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
-                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());
+                            // Horizontally add the bytes for s1, multiply-adds the
+                            // bytes by [ 32, 31, 30, ... ] for s2.
+                            v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
+                            Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
+                            v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());
 
-                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
-                        Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
-                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());
+                            v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
+                            Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
+                            v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());
 
-                        localBufferPtr += BLOCK_SIZE;
-                    }
-                    while (--n > 0);
+                            localBufferPtr += BLOCK_SIZE;
+                        }
+                        while (--n > 0);
 
-                    v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));
+                        v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));
 
-                    // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
-                    const byte S2301 = 0b1011_0001; // A B C D -> B A D C
-                    const byte S1032 = 0b0100_1110; // A B C D -> C D A B
+                        // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
+                        const byte S2301 = 0b1011_0001; // A B C D -> B A D C
+                        const byte S1032 = 0b0100_1110; // A B C D -> C D A B
 
-                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
+                        v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
 
-                    s1 += v_s1.ToScalar();
+                        s1 += v_s1.ToScalar();
 
-                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
-                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
+                        v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
+                        v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
 
-                    s2 = v_s2.ToScalar();
+                        s2 = v_s2.ToScalar();
 
-                    // Reduce.
-                    s1 %= BASE;
-                    s2 %= BASE;
-                }
+                        // Reduce.
+                        s1 %= BASE;
+                        s2 %= BASE;
+                    }
 
-                if (length > 0)
-                {
+                    if (length > 0)
+                    {
-                    if (length >= 16)
-                    {
+                        if (length >= 16)
+                        {
-                        s2 += s1 += localBufferPtr[0];
-                        s2 += s1 += localBufferPtr[1];
-                        s2 += s1 += localBufferPtr[2];
-                        s2 += s1 += localBufferPtr[3];
-                        s2 += s1 += localBufferPtr[4];
-                        s2 += s1 += localBufferPtr[5];
-                        s2 += s1 += localBufferPtr[6];
-                        s2 += s1 += localBufferPtr[7];
-                        s2 += s1 += localBufferPtr[8];
-                        s2 += s1 += localBufferPtr[9];
-                        s2 += s1 += localBufferPtr[10];
-                        s2 += s1 += localBufferPtr[11];
-                        s2 += s1 += localBufferPtr[12];
-                        s2 += s1 += localBufferPtr[13];
-                        s2 += s1 += localBufferPtr[14];
-                        s2 += s1 += localBufferPtr[15];
+                            s2 += s1 += localBufferPtr[0];
+                            s2 += s1 += localBufferPtr[1];
+                            s2 += s1 += localBufferPtr[2];
+                            s2 += s1 += localBufferPtr[3];
+                            s2 += s1 += localBufferPtr[4];
+                            s2 += s1 += localBufferPtr[5];
+                            s2 += s1 += localBufferPtr[6];
+                            s2 += s1 += localBufferPtr[7];
+                            s2 += s1 += localBufferPtr[8];
+                            s2 += s1 += localBufferPtr[9];
+                            s2 += s1 += localBufferPtr[10];
+                            s2 += s1 += localBufferPtr[11];
+                            s2 += s1 += localBufferPtr[12];
+                            s2 += s1 += localBufferPtr[13];
+                            s2 += s1 += localBufferPtr[14];
+                            s2 += s1 += localBufferPtr[15];
 
-                        localBufferPtr += 16;
-                        length -= 16;
-                    }
+                            localBufferPtr += 16;
+                            length -= 16;
+                        }
 
-                    while (length-- > 0)
-                    {
-                        s2 += s1 += *localBufferPtr++;
-                    }
+                        while (length-- > 0)
+                        {
+                            s2 += s1 += *localBufferPtr++;
+                        }
 
-                    if (s1 >= BASE)
-                    {
-                        s1 -= BASE;
-                    }
+                        if (s1 >= BASE)
+                        {
+                            s1 -= BASE;
+                        }
 
-                    s2 %= BASE;
-                }
+                        s2 %= BASE;
+                    }
 
-                return s1 | (s2 << 16);
+                    return s1 | (s2 << 16);
+                }
             }
         }
 #endif
diff --git a/src/ImageSharp/Compression/Zlib/Crc32.cs b/src/ImageSharp/Compression/Zlib/Crc32.cs
index 0ba368df6..075d6112a 100644
--- a/src/ImageSharp/Compression/Zlib/Crc32.cs
+++ b/src/ImageSharp/Compression/Zlib/Crc32.cs
@@ -83,117 +83,119 @@ namespace SixLabors.ImageSharp.Compression.Zlib
             int length = chunksize;
             fixed (byte* bufferPtr = buffer)
-            fixed (ulong* k05PolyPtr = K05Poly)
             {
+                fixed (ulong* k05PolyPtr = K05Poly)
+                {
-                byte* localBufferPtr = bufferPtr;
-                ulong* localK05PolyPtr = k05PolyPtr;
+                    byte* localBufferPtr = bufferPtr;
+                    ulong* localK05PolyPtr = k05PolyPtr;
 
-                // There's at least one block of 64.
-                Vector128<ulong> x1 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x00));
-                Vector128<ulong> x2 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x10));
-                Vector128<ulong> x3 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x20));
-                Vector128<ulong> x4 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x30));
-                Vector128<ulong> x5;
+                    // There's at least one block of 64.
+                    Vector128<ulong> x1 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x00));
+                    Vector128<ulong> x2 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x10));
+                    Vector128<ulong> x3 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x20));
+                    Vector128<ulong> x4 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x30));
+                    Vector128<ulong> x5;
 
-                x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());
+                    x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());
 
-                // k1, k2
-                Vector128<ulong> x0 = Sse2.LoadVector128(localK05PolyPtr + 0x0);
+                    // k1, k2
+                    Vector128<ulong> x0 = Sse2.LoadVector128(localK05PolyPtr + 0x0);
 
-                localBufferPtr += 64;
-                length -= 64;
+                    localBufferPtr += 64;
+                    length -= 64;
 
-                // Parallel fold blocks of 64, if any.
-                while (length >= 64)
-                {
+                    // Parallel fold blocks of 64, if any.
+                    while (length >= 64)
+                    {
-                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                    Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
-                    Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
-                    Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);
+                        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                        Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
+                        Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
+                        Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);
 
-                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
-                    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
-                    x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
-                    x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);
+                        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
+                        x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
+                        x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
+                        x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);
 
-                    Vector128<ulong> y5 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x00));
-                    Vector128<ulong> y6 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x10));
-                    Vector128<ulong> y7 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x20));
-                    Vector128<ulong> y8 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x30));
+                        Vector128<ulong> y5 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x00));
+                        Vector128<ulong> y6 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x10));
+                        Vector128<ulong> y7 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x20));
+                        Vector128<ulong> y8 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x30));
 
-                    x1 = Sse2.Xor(x1, x5);
-                    x2 = Sse2.Xor(x2, x6);
-                    x3 = Sse2.Xor(x3, x7);
-                    x4 = Sse2.Xor(x4, x8);
+                        x1 = Sse2.Xor(x1, x5);
+                        x2 = Sse2.Xor(x2, x6);
+                        x3 = Sse2.Xor(x3, x7);
+                        x4 = Sse2.Xor(x4, x8);
 
-                    x1 = Sse2.Xor(x1, y5);
-                    x2 = Sse2.Xor(x2, y6);
-                    x3 = Sse2.Xor(x3, y7);
-                    x4 = Sse2.Xor(x4, y8);
+                        x1 = Sse2.Xor(x1, y5);
+                        x2 = Sse2.Xor(x2, y6);
+                        x3 = Sse2.Xor(x3, y7);
+                        x4 = Sse2.Xor(x4, y8);
 
-                    localBufferPtr += 64;
-                    length -= 64;
-                }
+                        localBufferPtr += 64;
+                        length -= 64;
+                    }
 
-                // Fold into 128-bits.
-                // k3, k4
-                x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);
+                    // Fold into 128-bits.
+                    // k3, k4
+                    x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);
 
-                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
-                x1 = Sse2.Xor(x1, x2);
-                x1 = Sse2.Xor(x1, x5);
+                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
+                    x1 = Sse2.Xor(x1, x2);
+                    x1 = Sse2.Xor(x1, x5);
 
-                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
-                x1 = Sse2.Xor(x1, x3);
-                x1 = Sse2.Xor(x1, x5);
+                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
+                    x1 = Sse2.Xor(x1, x3);
+                    x1 = Sse2.Xor(x1, x5);
 
-                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
-                x1 = Sse2.Xor(x1, x4);
-                x1 = Sse2.Xor(x1, x5);
+                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
+                    x1 = Sse2.Xor(x1, x4);
+                    x1 = Sse2.Xor(x1, x5);
 
-                // Single fold blocks of 16, if any.
-                while (length >= 16)
-                {
+                    // Single fold blocks of 16, if any.
+                    while (length >= 16)
+                    {
-                    x2 = Sse2.LoadVector128((ulong*)localBufferPtr);
+                        x2 = Sse2.LoadVector128((ulong*)localBufferPtr);
 
-                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
-                    x1 = Sse2.Xor(x1, x2);
-                    x1 = Sse2.Xor(x1, x5);
+                        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
+                        x1 = Sse2.Xor(x1, x2);
+                        x1 = Sse2.Xor(x1, x5);
 
-                    localBufferPtr += 16;
-                    length -= 16;
-                }
+                        localBufferPtr += 16;
+                        length -= 16;
+                    }
 
-                // Fold 128-bits to 64-bits.
-                x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
-                x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
-                x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
-                x1 = Sse2.Xor(x1, x2);
+                    // Fold 128-bits to 64-bits.
+                    x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
+                    x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
+                    x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
+                    x1 = Sse2.Xor(x1, x2);
 
-                // k5, k0
-                x0 = Sse2.LoadScalarVector128(localK05PolyPtr + 0x4);
+                    // k5, k0
+                    x0 = Sse2.LoadScalarVector128(localK05PolyPtr + 0x4);
 
-                x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
-                x1 = Sse2.And(x1, x3);
-                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                x1 = Sse2.Xor(x1, x2);
+                    x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
+                    x1 = Sse2.And(x1, x3);
+                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                    x1 = Sse2.Xor(x1, x2);
 
-                // Barret reduce to 32-bits.
-                // polynomial
-                x0 = Sse2.LoadVector128(localK05PolyPtr + 0x6);
+                    // Barret reduce to 32-bits.
+                    // polynomial
+                    x0 = Sse2.LoadVector128(localK05PolyPtr + 0x6);
 
-                x2 = Sse2.And(x1, x3);
-                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
-                x2 = Sse2.And(x2, x3);
-                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
-                x1 = Sse2.Xor(x1, x2);
+                    x2 = Sse2.And(x1, x3);
+                    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
+                    x2 = Sse2.And(x2, x3);
+                    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
+                    x1 = Sse2.Xor(x1, x2);
 
-                crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
-                return buffer.Length - chunksize == 0 ? crc : CalculateScalar(crc, buffer.Slice(chunksize));
+                    crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
+                    return buffer.Length - chunksize == 0 ? crc : CalculateScalar(crc, buffer.Slice(chunksize));
+                }
             }
         }
 #endif
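Note: both hunks are pure re-indentation. C# allows fixed statements to be stacked
without braces, so the old form already pinned both buffers for the same scope; the
patch only rewrites the stacked statements as explicitly braced, nested blocks,
evidently to satisfy the project's brace style. A minimal sketch of the two
equivalent forms follows; the demo type and methods are hypothetical (not part of
the ImageSharp sources) and must be compiled with unsafe code enabled:

    using System;

    public static class FixedNestingDemo
    {
        // Stacked form (before): the second fixed statement is the embedded
        // statement of the first, so no braces are required and both pointers
        // stay pinned for the duration of the body.
        public static unsafe int SumStacked(byte[] a, byte[] b)
        {
            fixed (byte* pa = a)
            fixed (byte* pb = b)
            {
                return pa[0] + pb[0];
            }
        }

        // Nested form (after): identical semantics, with every fixed block
        // braced explicitly.
        public static unsafe int SumNested(byte[] a, byte[] b)
        {
            fixed (byte* pa = a)
            {
                fixed (byte* pb = b)
                {
                    return pa[0] + pb[0];
                }
            }
        }

        public static void Main()
        {
            byte[] x = { 1 };
            byte[] y = { 2 };
            Console.WriteLine(SumStacked(x, y)); // 3
            Console.WriteLine(SumNested(x, y));  // 3
        }
    }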