|
|
|
@ -9,6 +9,8 @@ using System.IO.Compression; |
|
|
|
using System.IO.Hashing; |
|
|
|
using System.Runtime.CompilerServices; |
|
|
|
using System.Runtime.InteropServices; |
|
|
|
using System.Runtime.Intrinsics; |
|
|
|
using System.Runtime.Intrinsics.X86; |
|
|
|
using System.Text; |
|
|
|
using SixLabors.ImageSharp.Common.Helpers; |
|
|
|
using SixLabors.ImageSharp.Compression.Zlib; |
|
|
|
@ -900,7 +902,7 @@ internal sealed class PngDecoderCore : ImageDecoderCore |
|
|
|
|
|
|
|
if (this.isCgbi) |
|
|
|
{ |
|
|
|
ApplyCgbiTransform(scanSpan[1..], this.pngColorType); |
|
|
|
this.ApplyCgbiTransform(scanSpan[1..], this.pngColorType); |
|
|
|
} |
|
|
|
|
|
|
|
this.ProcessDefilteredScanline(frameControl, currentRow, scanSpan, imageFrame, pngMetadata, blendRowBuffer); |
|
|
|
@ -1035,7 +1037,7 @@ internal sealed class PngDecoderCore : ImageDecoderCore |
|
|
|
|
|
|
|
if (this.isCgbi) |
|
|
|
{ |
|
|
|
ApplyCgbiTransform(scanSpan[1..], this.pngColorType); |
|
|
|
this.ApplyCgbiTransform(scanSpan[1..], this.pngColorType); |
|
|
|
} |
|
|
|
|
|
|
|
Span<TPixel> rowSpan = imageBuffer.DangerousGetRowSpan(currentRow); |
|
|
|
@ -2505,39 +2507,289 @@ internal sealed class PngDecoderCore : ImageDecoderCore |
|
|
|
/// </remarks>
|
|
|
|
/// <param name="scanline">The defiltered pixel bytes (without the leading filter byte).</param>
|
|
|
|
/// <param name="colorType">The PNG color type from IHDR.</param>
|
|
|
|
private static void ApplyCgbiTransform(Span<byte> scanline, PngColorType colorType) |
|
|
|
private void ApplyCgbiTransform(Span<byte> scanline, PngColorType colorType) |
|
|
|
{ |
|
|
|
if (colorType == PngColorType.RgbWithAlpha) |
|
|
|
{ |
|
|
|
Span<Rgba32> pixels = MemoryMarshal.Cast<byte, Rgba32>(scanline); |
|
|
|
for (int i = 0; i < pixels.Length; i++) |
|
|
|
int i = 0; |
|
|
|
|
|
|
|
if (Vector512.IsHardwareAccelerated && pixels.Length >= 16) |
|
|
|
{ |
|
|
|
ref Rgba32 p = ref pixels[i]; |
|
|
|
byte r = p.B; |
|
|
|
byte g = p.G; |
|
|
|
byte b = p.R; |
|
|
|
byte a = p.A; |
|
|
|
i = ApplyCgbiTransformVector512(scanline, pixels.Length); |
|
|
|
} |
|
|
|
|
|
|
|
if (a is not 0 and not byte.MaxValue) |
|
|
|
{ |
|
|
|
// Reverse: c' = c * a / 255 => c = round(c' * 255 / a)
|
|
|
|
int half = a >> 1; |
|
|
|
r = (byte)Math.Min(byte.MaxValue, ((r * byte.MaxValue) + half) / a); |
|
|
|
g = (byte)Math.Min(byte.MaxValue, ((g * byte.MaxValue) + half) / a); |
|
|
|
b = (byte)Math.Min(byte.MaxValue, ((b * byte.MaxValue) + half) / a); |
|
|
|
} |
|
|
|
if (Vector256.IsHardwareAccelerated && Avx2.IsSupported && (pixels.Length - i) >= 8) |
|
|
|
{ |
|
|
|
i = ApplyCgbiTransformVector256(scanline, i, pixels.Length); |
|
|
|
} |
|
|
|
|
|
|
|
p = new Rgba32(r, g, b, a); |
|
|
|
if (Vector128.IsHardwareAccelerated && (pixels.Length - i) >= 4) |
|
|
|
{ |
|
|
|
i = ApplyCgbiTransformVector128(scanline, i, pixels.Length); |
|
|
|
} |
|
|
|
|
|
|
|
for (; i < pixels.Length; i++) |
|
|
|
{ |
|
|
|
ref Rgba32 pixel = ref pixels[i]; |
|
|
|
pixel = new Rgba32(pixel.B, pixel.G, pixel.R, pixel.A); |
|
|
|
UndoCgbiPremultiplicationScalar(ref pixel); |
|
|
|
} |
|
|
|
} |
|
|
|
else if (colorType == PngColorType.Rgb) |
|
|
|
{ |
|
|
|
Span<Rgb24> pixels = MemoryMarshal.Cast<byte, Rgb24>(scanline); |
|
|
|
for (int i = 0; i < pixels.Length; i++) |
|
|
|
{ |
|
|
|
ref Rgb24 p = ref pixels[i]; |
|
|
|
(p.R, p.B) = (p.B, p.R); |
|
|
|
} |
|
|
|
// No alpha channel, so just swap R and B using built in SIMD-optimized pixel operations.
|
|
|
|
Span<Rgb24> target = MemoryMarshal.Cast<byte, Rgb24>(scanline); |
|
|
|
PixelOperations<Rgb24>.Instance.FromBgr24Bytes(this.configuration, scanline, target, target.Length); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
[MethodImpl(MethodImplOptions.AggressiveInlining)] |
|
|
|
private static void UndoCgbiPremultiplicationScalar(ref Rgba32 pixel) |
|
|
|
{ |
|
|
|
byte a = pixel.A; |
|
|
|
if (a is 0 or byte.MaxValue) |
|
|
|
{ |
|
|
|
return; |
|
|
|
} |
|
|
|
|
|
|
|
// Reverse: c' = c * a / 255 => c = round(c' * 255 / a)
|
|
|
|
int half = a >> 1; |
|
|
|
byte r = (byte)Math.Min(byte.MaxValue, ((pixel.R * byte.MaxValue) + half) / a); |
|
|
|
byte g = (byte)Math.Min(byte.MaxValue, ((pixel.G * byte.MaxValue) + half) / a); |
|
|
|
byte b = (byte)Math.Min(byte.MaxValue, ((pixel.B * byte.MaxValue) + half) / a); |
|
|
|
pixel = new Rgba32(r, g, b, a); |
|
|
|
} |
|
|
|
|
|
|
|
private static int ApplyCgbiTransformVector512(Span<byte> scanline, int pixelCount) |
|
|
|
{ |
|
|
|
ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline); |
|
|
|
int i = 0; |
|
|
|
|
|
|
|
Span<byte> temp = stackalloc byte[Vector512<byte>.Count]; |
|
|
|
SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012); |
|
|
|
|
|
|
|
// MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting
|
|
|
|
// CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place.
|
|
|
|
// The generated mask only swaps bytes inside each pixel, so it remains
|
|
|
|
// correct for the optimized 512-bit byte shuffle helper.
|
|
|
|
Vector512<byte> shuffleMask = Unsafe.As<byte, Vector512<byte>>(ref MemoryMarshal.GetReference(temp)); |
|
|
|
|
|
|
|
Vector512<int> zero = Vector512<int>.Zero; |
|
|
|
Vector512<int> one = Vector512<int>.One; |
|
|
|
Vector512<int> byteMask = Vector512.Create(0xFF); |
|
|
|
Vector512<int> opaque = Vector512.Create(0xFF); |
|
|
|
Vector512<int> byteMax = Vector512.Create((int)byte.MaxValue); |
|
|
|
|
|
|
|
for (; i <= pixelCount - 16; i += 16) |
|
|
|
{ |
|
|
|
ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf<Rgba32>()); |
|
|
|
Vector512<byte> bgra = Unsafe.ReadUnaligned<Vector512<byte>>(ref blockRef); |
|
|
|
Vector512<byte> rgba = Vector512_.ShuffleNative(bgra, shuffleMask); |
|
|
|
Vector512<int> packed = rgba.AsInt32(); |
|
|
|
Vector512<int> alpha = Vector512.ShiftRightLogical(packed, 24); |
|
|
|
|
|
|
|
// Fully transparent and fully opaque pixels are identity cases for
|
|
|
|
// unpremultiplication. Masking them keeps the scalar behavior and lets
|
|
|
|
// safeAlpha avoid dividing by zero for alpha == 0.
|
|
|
|
Vector512<int> partialMask = ~(Vector512.Equals(alpha, zero) | Vector512.Equals(alpha, opaque)); |
|
|
|
|
|
|
|
Vector512<int> r = packed & byteMask; |
|
|
|
Vector512<int> g = Vector512.ShiftRightLogical(packed, 8) & byteMask; |
|
|
|
Vector512<int> b = Vector512.ShiftRightLogical(packed, 16) & byteMask; |
|
|
|
|
|
|
|
Vector512<int> safeAlpha = Vector512.ConditionalSelect(partialMask, alpha, one); |
|
|
|
Vector512<int> halfAlpha = Vector512.ShiftRightLogical(safeAlpha, 1); |
|
|
|
Vector512<float> safeAlphaF = Vector512.ConvertToSingle(safeAlpha); |
|
|
|
|
|
|
|
// The scalar path computes ((c * 255) + (a >> 1)) / a with integer
|
|
|
|
// division. Floor the positive quotient before converting so SIMD does
|
|
|
|
// not use the default round-to-nearest conversion and drift by one.
|
|
|
|
Vector512<int> unpremultipliedR = Vector512.Min( |
|
|
|
byteMax, |
|
|
|
Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF))); |
|
|
|
|
|
|
|
Vector512<int> unpremultipliedG = Vector512.Min( |
|
|
|
byteMax, |
|
|
|
Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF))); |
|
|
|
|
|
|
|
Vector512<int> unpremultipliedB = Vector512.Min( |
|
|
|
byteMax, |
|
|
|
Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF))); |
|
|
|
|
|
|
|
// ConditionalSelect applies the expensive unpremultiply only to pixels
|
|
|
|
// where alpha is between 1 and 254; alpha 0 and 255 lanes keep the
|
|
|
|
// shuffled channel values exactly as the scalar path does.
|
|
|
|
Vector512<int> finalR = Vector512.ConditionalSelect(partialMask, unpremultipliedR, r); |
|
|
|
Vector512<int> finalG = Vector512.ConditionalSelect(partialMask, unpremultipliedG, g); |
|
|
|
Vector512<int> finalB = Vector512.ConditionalSelect(partialMask, unpremultipliedB, b); |
|
|
|
|
|
|
|
// Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so
|
|
|
|
// shifting the unpacked channels back to byte offsets 0, 1, 2, and 3
|
|
|
|
// recreates the in-memory RGBA bytes for the unaligned store.
|
|
|
|
Vector512<int> result = |
|
|
|
finalR | |
|
|
|
Vector512.ShiftLeft(finalG, 8) | |
|
|
|
Vector512.ShiftLeft(finalB, 16) | |
|
|
|
Vector512.ShiftLeft(alpha, 24); |
|
|
|
|
|
|
|
Unsafe.WriteUnaligned(ref blockRef, result.AsByte()); |
|
|
|
} |
|
|
|
|
|
|
|
return i; |
|
|
|
} |
|
|
|
|
|
|
|
private static int ApplyCgbiTransformVector256(Span<byte> scanline, int startPixel, int pixelCount) |
|
|
|
{ |
|
|
|
ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline); |
|
|
|
int i = startPixel; |
|
|
|
|
|
|
|
Span<byte> temp = stackalloc byte[Vector512<byte>.Count]; |
|
|
|
SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012); |
|
|
|
|
|
|
|
// MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting
|
|
|
|
// CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place.
|
|
|
|
// Avx2.Shuffle is 128-bit lane-local, and the generated mask repeats inside
|
|
|
|
// each lane, so no byte ever needs to cross the lane boundary.
|
|
|
|
Vector256<byte> shuffleMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp)); |
|
|
|
|
|
|
|
Vector256<int> zero = Vector256<int>.Zero; |
|
|
|
Vector256<int> one = Vector256<int>.One; |
|
|
|
Vector256<int> byteMask = Vector256.Create(0xFF); |
|
|
|
Vector256<int> opaque = Vector256.Create(0xFF); |
|
|
|
Vector256<int> byteMax = Vector256.Create((int)byte.MaxValue); |
|
|
|
|
|
|
|
for (; i <= pixelCount - 8; i += 8) |
|
|
|
{ |
|
|
|
ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf<Rgba32>()); |
|
|
|
Vector256<byte> bgra = Unsafe.ReadUnaligned<Vector256<byte>>(ref blockRef); |
|
|
|
Vector256<byte> rgba = Vector256_.ShufflePerLane(bgra, shuffleMask); |
|
|
|
Vector256<int> packed = rgba.AsInt32(); |
|
|
|
Vector256<int> alpha = Vector256.ShiftRightLogical(packed, 24); |
|
|
|
|
|
|
|
// Fully transparent and fully opaque pixels are identity cases for
|
|
|
|
// unpremultiplication. Masking them keeps the scalar behavior and lets
|
|
|
|
// safeAlpha avoid dividing by zero for alpha == 0.
|
|
|
|
Vector256<int> partialMask = ~(Vector256.Equals(alpha, zero) | Vector256.Equals(alpha, opaque)); |
|
|
|
|
|
|
|
Vector256<int> r = packed & byteMask; |
|
|
|
Vector256<int> g = Vector256.ShiftRightLogical(packed, 8) & byteMask; |
|
|
|
Vector256<int> b = Vector256.ShiftRightLogical(packed, 16) & byteMask; |
|
|
|
|
|
|
|
Vector256<int> safeAlpha = Vector256.ConditionalSelect(partialMask, alpha, one); |
|
|
|
Vector256<int> halfAlpha = Vector256.ShiftRightLogical(safeAlpha, 1); |
|
|
|
Vector256<float> safeAlphaF = Vector256.ConvertToSingle(safeAlpha); |
|
|
|
|
|
|
|
// The scalar path computes ((c * 255) + (a >> 1)) / a with integer
|
|
|
|
// division. Floor the positive quotient before converting so SIMD does
|
|
|
|
// not use the default round-to-nearest conversion and drift by one.
|
|
|
|
Vector256<int> unpremultipliedR = Vector256.Min( |
|
|
|
byteMax, |
|
|
|
Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF))); |
|
|
|
|
|
|
|
Vector256<int> unpremultipliedG = Vector256.Min( |
|
|
|
byteMax, |
|
|
|
Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF))); |
|
|
|
|
|
|
|
Vector256<int> unpremultipliedB = Vector256.Min( |
|
|
|
byteMax, |
|
|
|
Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF))); |
|
|
|
|
|
|
|
// ConditionalSelect applies the expensive unpremultiply only to pixels
|
|
|
|
// where alpha is between 1 and 254; alpha 0 and 255 lanes keep the
|
|
|
|
// shuffled channel values exactly as the scalar path does.
|
|
|
|
Vector256<int> finalR = Vector256.ConditionalSelect(partialMask, unpremultipliedR, r); |
|
|
|
Vector256<int> finalG = Vector256.ConditionalSelect(partialMask, unpremultipliedG, g); |
|
|
|
Vector256<int> finalB = Vector256.ConditionalSelect(partialMask, unpremultipliedB, b); |
|
|
|
|
|
|
|
// Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so
|
|
|
|
// shifting the unpacked channels back to byte offsets 0, 1, 2, and 3
|
|
|
|
// recreates the in-memory RGBA bytes for the unaligned store.
|
|
|
|
Vector256<int> result = |
|
|
|
finalR | |
|
|
|
Vector256.ShiftLeft(finalG, 8) | |
|
|
|
Vector256.ShiftLeft(finalB, 16) | |
|
|
|
Vector256.ShiftLeft(alpha, 24); |
|
|
|
|
|
|
|
Unsafe.WriteUnaligned(ref blockRef, result.AsByte()); |
|
|
|
} |
|
|
|
|
|
|
|
return i; |
|
|
|
} |
|
|
|
|
|
|
|
private static int ApplyCgbiTransformVector128(Span<byte> scanline, int startPixel, int pixelCount) |
|
|
|
{ |
|
|
|
ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline); |
|
|
|
int i = startPixel; |
|
|
|
|
|
|
|
Span<byte> temp = stackalloc byte[Vector512<byte>.Count]; |
|
|
|
SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012); |
|
|
|
|
|
|
|
// MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting
|
|
|
|
// CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place.
|
|
|
|
Vector128<byte> shuffleMask = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(temp)); |
|
|
|
|
|
|
|
Vector128<int> zero = Vector128<int>.Zero; |
|
|
|
Vector128<int> one = Vector128<int>.One; |
|
|
|
Vector128<int> byteMask = Vector128.Create(0xFF); |
|
|
|
Vector128<int> opaque = Vector128.Create(0xFF); |
|
|
|
Vector128<int> byteMax = Vector128.Create((int)byte.MaxValue); |
|
|
|
|
|
|
|
for (; i <= pixelCount - 4; i += 4) |
|
|
|
{ |
|
|
|
ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf<Rgba32>()); |
|
|
|
Vector128<byte> bgra = Unsafe.ReadUnaligned<Vector128<byte>>(ref blockRef); |
|
|
|
Vector128<byte> rgba = Vector128_.ShuffleNative(bgra, shuffleMask); |
|
|
|
Vector128<int> packed = rgba.AsInt32(); |
|
|
|
Vector128<int> alpha = Vector128.ShiftRightLogical(packed, 24); |
|
|
|
|
|
|
|
// Fully transparent and fully opaque pixels are identity cases for
|
|
|
|
// unpremultiplication. Masking them keeps the scalar behavior and lets
|
|
|
|
// safeAlpha avoid dividing by zero for alpha == 0.
|
|
|
|
Vector128<int> partialMask = ~(Vector128.Equals(alpha, zero) | Vector128.Equals(alpha, opaque)); |
|
|
|
|
|
|
|
Vector128<int> r = packed & byteMask; |
|
|
|
Vector128<int> g = Vector128.ShiftRightLogical(packed, 8) & byteMask; |
|
|
|
Vector128<int> b = Vector128.ShiftRightLogical(packed, 16) & byteMask; |
|
|
|
|
|
|
|
Vector128<int> safeAlpha = Vector128.ConditionalSelect(partialMask, alpha, one); |
|
|
|
Vector128<int> halfAlpha = Vector128.ShiftRightLogical(safeAlpha, 1); |
|
|
|
Vector128<float> safeAlphaF = Vector128.ConvertToSingle(safeAlpha); |
|
|
|
|
|
|
|
// The scalar path computes ((c * 255) + (a >> 1)) / a with integer
|
|
|
|
// division. Floor the positive quotient before converting so SIMD does
|
|
|
|
// not use the default round-to-nearest conversion and drift by one.
|
|
|
|
Vector128<int> unpremultipliedR = Vector128.Min( |
|
|
|
byteMax, |
|
|
|
Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF))); |
|
|
|
|
|
|
|
Vector128<int> unpremultipliedG = Vector128.Min( |
|
|
|
byteMax, |
|
|
|
Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF))); |
|
|
|
|
|
|
|
Vector128<int> unpremultipliedB = Vector128.Min( |
|
|
|
byteMax, |
|
|
|
Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF))); |
|
|
|
|
|
|
|
// ConditionalSelect applies the expensive unpremultiply only to pixels
|
|
|
|
// where alpha is between 1 and 254; alpha 0 and 255 lanes keep the
|
|
|
|
// shuffled channel values exactly as the scalar path does.
|
|
|
|
Vector128<int> finalR = Vector128.ConditionalSelect(partialMask, unpremultipliedR, r); |
|
|
|
Vector128<int> finalG = Vector128.ConditionalSelect(partialMask, unpremultipliedG, g); |
|
|
|
Vector128<int> finalB = Vector128.ConditionalSelect(partialMask, unpremultipliedB, b); |
|
|
|
|
|
|
|
// Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so
|
|
|
|
// shifting the unpacked channels back to byte offsets 0, 1, 2, and 3
|
|
|
|
// recreates the in-memory RGBA bytes for the unaligned store.
|
|
|
|
Vector128<int> result = |
|
|
|
finalR | |
|
|
|
Vector128.ShiftLeft(finalG, 8) | |
|
|
|
Vector128.ShiftLeft(finalB, 16) | |
|
|
|
Vector128.ShiftLeft(alpha, 24); |
|
|
|
|
|
|
|
Unsafe.WriteUnaligned(ref blockRef, result.AsByte()); |
|
|
|
} |
|
|
|
|
|
|
|
return i; |
|
|
|
} |
|
|
|
} |
|
|
|
|