diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 0320229a2b..218b2b59c3 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -71,11 +71,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // ReSharper disable once InconsistentNaming int prevDCY = 0, prevDCCb = 0, prevDCCr = 0; - var pixelConverter = YCbCrForwardConverter.Create(); ImageFrame frame = pixels.Frames.RootFrame; Buffer2D pixelBuffer = frame.PixelBuffer; RowOctet currentRows = default; + var pixelConverter = new YCbCrForwardConverter444(frame); + for (int y = 0; y < pixels.Height; y += 8) { cancellationToken.ThrowIfCancellationRequested(); @@ -83,7 +84,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder for (int x = 0; x < pixels.Width; x += 8) { - pixelConverter.Convert(frame, x, y, ref currentRows); + pixelConverter.Convert(x, y, ref currentRows); prevDCY = this.WriteBlock( QuantIndex.Luminance, @@ -123,57 +124,53 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void Encode420(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - Block8x8F b = default; - Span cb = stackalloc Block8x8F[4]; - Span cr = stackalloc Block8x8F[4]; - var unzig = ZigZag.CreateUnzigTable(); - var pixelConverter = YCbCrForwardConverter.Create(); - // ReSharper disable once InconsistentNaming int prevDCY = 0, prevDCCb = 0, prevDCCr = 0; ImageFrame frame = pixels.Frames.RootFrame; Buffer2D pixelBuffer = frame.PixelBuffer; RowOctet currentRows = default; + var pixelConverter = new YCbCrForwardConverter420(frame); + for (int y = 0; y < pixels.Height; y += 16) { cancellationToken.ThrowIfCancellationRequested(); for (int x = 0; x < pixels.Width; x += 16) { - for (int i = 0; i < 4; i++) + for (int i = 0; i < 2; i++) { - int xOff = (i & 1) * 8; - int yOff = (i & 2) * 4; - + int yOff = i * 8; currentRows.Update(pixelBuffer, y + yOff); - pixelConverter.Convert(frame, x + xOff, y + yOff, ref currentRows); + pixelConverter.Convert(x, y, ref currentRows, i); - cb[i] = pixelConverter.Cb; - cr[i] = pixelConverter.Cr; + prevDCY = this.WriteBlock( + QuantIndex.Luminance, + prevDCY, + ref pixelConverter.YLeft, + ref luminanceQuantTable, + ref unzig); prevDCY = this.WriteBlock( QuantIndex.Luminance, prevDCY, - ref pixelConverter.Y, + ref pixelConverter.YRight, ref luminanceQuantTable, ref unzig); } - Block8x8F.Scale16X16To8X8(ref b, cb); prevDCCb = this.WriteBlock( QuantIndex.Chrominance, prevDCCb, - ref b, + ref pixelConverter.Cb, ref chrominanceQuantTable, ref unzig); - Block8x8F.Scale16X16To8X8(ref b, cr); prevDCCr = this.WriteBlock( QuantIndex.Chrominance, prevDCCr, - ref b, + ref pixelConverter.Cr, ref chrominanceQuantTable, ref unzig); } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs index 1ceea1e08a..7681063eef 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs @@ -115,7 +115,41 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits; } - public void Convert(Span rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ConvertPixelInto( + int r, + int g, + int b, + ref Block8x8F yResult, + int i) + { + // float y = (0.299F * r) + (0.587F * g) + (0.114F * b); + yResult[i] = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ConvertPixelInto(int r, int g, int b, ref float yResult) => + // float y = (0.299F * r) + (0.587F * g) + (0.114F * b); + yResult = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ConvertPixelInto(int r, int g, int b, ref float cbResult, ref float crResult) + { + // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); + cbResult = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits; + + // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); + crResult = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits; + } + + /// + /// Converts Rgb24 pixels into YCbCr color space with 4:4:4 subsampling sampling of luminance and chroma. + /// + /// Span of Rgb24 pixel data + /// Resulting Y values block + /// Resulting Cb values block + /// Resulting Cr values block + public void Convert444(Span rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) { ref Rgb24 rgbStart = ref rgbSpan[0]; @@ -134,6 +168,84 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } } + /// + /// Converts Rgb24 pixels into YCbCr color space with 4:2:0 subsampling of luminance and chroma. + /// + /// Calculates 2 out of 4 luminance blocks and half of chroma blocks. This method must be called twice per 4x 8x8 DCT blocks with different row param. + /// Span of Rgb24 pixel data + /// First or "left" resulting Y block + /// Second or "right" resulting Y block + /// Resulting Cb values block + /// Resulting Cr values block + /// Row index of the 16x16 block, 0 or 1 + public void Convert420(Span rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row) + { + DebugGuard.MustBeBetweenOrEqualTo(row, 0, 1, nameof(row)); + + ref float yBlockLeftRef = ref Unsafe.As(ref yBlockLeft); + ref float yBlockRightRef = ref Unsafe.As(ref yBlockRight); + + // 0-31 or 32-63 + // upper or lower part + int chromaWriteOffset = row * Block8x8F.Size / 2; + ref float cbBlockRef = ref Unsafe.Add(ref Unsafe.As(ref cbBlock), chromaWriteOffset); + ref float crBlockRef = ref Unsafe.Add(ref Unsafe.As(ref crBlock), chromaWriteOffset); + + ref Rgb24 rgbStart = ref rgbSpan[0]; + + for (int i = 0; i < 8; i += 2) + { + // 8 pixels by 3 integers + Span rgbTriplets = stackalloc int[24]; + + for (int j = 0; j < 2; j++) + { + int yBlockWriteOffset = (i + j) * 8; + ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, (i + j) * 16); + + // left + this.ConvertChunk420(ref stride, ref Unsafe.Add(ref yBlockLeftRef, yBlockWriteOffset), rgbTriplets); + + // right + this.ConvertChunk420(ref Unsafe.Add(ref stride, 8), ref Unsafe.Add(ref yBlockRightRef, yBlockWriteOffset), rgbTriplets.Slice(12)); + } + + int writeIdx = 8 * (i / 2); + ref float cbWriteRef = ref Unsafe.Add(ref cbBlockRef, writeIdx); + ref float crWriteRef = ref Unsafe.Add(ref crBlockRef, writeIdx); + for (int j = 0; j < 8; j++) + { + int idx = j * 3; + this.ConvertPixelInto( + rgbTriplets[idx] / 4, // r + rgbTriplets[idx + 1] / 4, // g + rgbTriplets[idx + 2] / 4, // b + ref Unsafe.Add(ref cbWriteRef, j), + ref Unsafe.Add(ref crWriteRef, j)); + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ConvertChunk420(ref Rgb24 stride, ref float yBlock, Span chromaRgbTriplet) + { + for (int k = 0; k < 8; k += 2) + { + ref float yBlockRef = ref Unsafe.Add(ref yBlock, k); + + Rgb24 px0 = Unsafe.Add(ref stride, k); + Rgb24 px1 = Unsafe.Add(ref stride, k + 1); + + this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef); + this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1)); + + int idx = 3 * (k / 2); + chromaRgbTriplet[idx] += px0.R + px1.R; + chromaRgbTriplet[idx + 1] += px0.G + px1.G; + chromaRgbTriplet[idx + 2] += px0.B + px1.B; + } + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Fix(float x) => (int)((x * (1L << ScaleBits)) + 0.5F); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs index 3ee1ca9890..b9f0fa4271 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -27,19 +27,27 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } } + public static int AvxRegisterRgbCompatibilityPadding + { + get + { + if (IsSupported) + { + return 8; + } + + return 0; + } + } + #if SUPPORTS_RUNTIME_INTRINSICS + private static ReadOnlySpan MoveFirst24BytesToSeparateLanes => new byte[] { 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; - private static ReadOnlySpan MoveLast24BytesToSeparateLanes => new byte[] - { - 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, - 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0 - }; - private static ReadOnlySpan ExtractRgb => new byte[] { 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF, @@ -47,6 +55,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder }; #endif + /// + /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:4:4 subsampling + /// + /// Total size of rgb span must be 200 bytes + /// Span of rgb pixels with size of 64 + /// 8x8 destination matrix of Luminance(Y) converted data + /// 8x8 destination matrix of Chrominance(Cb) converted data + /// 8x8 destination matrix of Chrominance(Cr) converted data public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) { Debug.Assert(IsSupported, "AVX2 is required to run this converter"); @@ -63,7 +79,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder var f05 = Vector256.Create(0.5f); var zero = Vector256.Create(0).AsByte(); - ref Vector256 inRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); + ref Vector256 rgbByteSpan = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); ref Vector256 destYRef = ref yBlock.V0; ref Vector256 destCbRef = ref cbBlock.V0; ref Vector256 destCrRef = ref crBlock.V0; @@ -72,9 +88,31 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); Vector256 rgb, rg, bx; Vector256 r, g, b; - for (int i = 0; i < 7; i++) + + // TODO: probably remove this after the draft + // rgbByteSpan contains 8 strides by 8 pixels each, thus 64 pixels total + // Strides are stored sequentially - one big span of 64 * 3 = 192 bytes + // Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits + // Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride: + // stride 0 0 - 192 -(+64bits)-> 256 + // stride 1 192 - 384 -(+64bits)-> 448 + // stride 2 384 - 576 -(+64bits)-> 640 + // stride 3 576 - 768 -(+64bits)-> 832 + // stride 4 768 - 960 -(+64bits)-> 1024 + // stride 5 960 - 1152 -(+64bits)-> 1216 + // stride 6 1152 - 1344 -(+64bits)-> 1408 + // stride 7 1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION + // + // Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits + // This is not permitted - we are reading foreign memory + // That's why last stride is calculated outside of the for-loop loop with special extract shuffle mask involved + // + // Extra mask & separate stride:7 calculations can be eliminated by simply providing rgb pixel span of slightly bigger size than pixels data need: + // Total pixel data size is 192 bytes, avx registers need it to be 200 bytes + const int bytesPerRgbStride = 24; + for (int i = 0; i < 8; i++) { - rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte(); + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * i)).AsUInt32(), extractToLanesMask).AsByte(); rgb = Avx2.Shuffle(rgb, extractRgbMask); @@ -94,27 +132,241 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); } +#endif + } + + /// + /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:2:0 subsampling + /// + /// Total size of rgb span must be 200 bytes + public static void Convert420(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock, int idx) + { + Debug.Assert(IsSupported, "AVX2 is required to run this converter"); + +#if SUPPORTS_RUNTIME_INTRINSICS + var f0299 = Vector256.Create(0.299f); + var f0587 = Vector256.Create(0.587f); + var f0114 = Vector256.Create(0.114f); + var fn0168736 = Vector256.Create(-0.168736f); + var fn0331264 = Vector256.Create(-0.331264f); + var f128 = Vector256.Create(128f); + var fn0418688 = Vector256.Create(-0.418688f); + var fn0081312F = Vector256.Create(-0.081312F); + var f05 = Vector256.Create(0.5f); + var zero = Vector256.Create(0).AsByte(); + + ref Vector256 rgbByteSpan = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); + ref Vector256 destYRef = ref yBlock.V0; + + int destOffset = (idx & 2) * 4 + (idx & 1); - extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes)); - rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte(); - rgb = Avx2.Shuffle(rgb, extractRgbMask); + ref Vector128 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), destOffset); + ref Vector128 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), destOffset); - rg = Avx2.UnpackLow(rgb, zero); - bx = Avx2.UnpackHigh(rgb, zero); + var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); + var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); + Vector256 rgb, rg, bx; + Vector256 r, g, b; + + Span> rDataLanes = stackalloc Vector256[4]; + Span> gDataLanes = stackalloc Vector256[4]; + Span> bDataLanes = stackalloc Vector256[4]; + + const int bytesPerRgbStride = 24; + for (int i = 0; i < 2; i++) + { + // each 4 lanes - [0, 1, 2, 3] & [4, 5, 6, 7] + for (int j = 0; j < 4; j++) + { + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte(); + + rgb = Avx2.Shuffle(rgb, extractRgbMask); - r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); - g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); - b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); - // (0.299F * r) + (0.587F * g) + (0.114F * b); - Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); - // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) - Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref destYRef, i * 4 + j) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); - // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) - Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + rDataLanes[j] = r; + gDataLanes[j] = g; + bDataLanes[j] = b; + } + + int localDestOffset = (i & 1) * 4; + + r = Scale_8x4_4x2(rDataLanes); + g = Scale_8x4_4x2(gDataLanes); + b = Scale_8x4_4x2(bDataLanes); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Vector256 cb = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); + Unsafe.Add(ref destCbRef, localDestOffset) = cb.GetLower(); + Unsafe.Add(ref destCbRef, localDestOffset + 2) = cb.GetUpper(); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Vector256 cr = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + Unsafe.Add(ref destCrRef, localDestOffset) = cr.GetLower(); + Unsafe.Add(ref destCrRef, localDestOffset + 2) = cr.GetUpper(); + } #endif } + + /// + /// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling + /// + public static void Convert420_16x8(ReadOnlySpan rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row) + { + Debug.Assert(IsSupported, "AVX2 is required to run this converter"); + +#if SUPPORTS_RUNTIME_INTRINSICS + var f0299 = Vector256.Create(0.299f); + var f0587 = Vector256.Create(0.587f); + var f0114 = Vector256.Create(0.114f); + var fn0168736 = Vector256.Create(-0.168736f); + var fn0331264 = Vector256.Create(-0.331264f); + var f128 = Vector256.Create(128f); + var fn0418688 = Vector256.Create(-0.418688f); + var fn0081312F = Vector256.Create(-0.081312F); + var f05 = Vector256.Create(0.5f); + var zero = Vector256.Create(0).AsByte(); + + ref Vector256 rgbByteSpan = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); + + int destOffset = row * 4; + + ref Vector256 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), destOffset); + ref Vector256 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), destOffset); + + var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); + var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); + Vector256 rgb, rg, bx; + Vector256 r, g, b; + + Span> rDataLanes = stackalloc Vector256[4]; + Span> gDataLanes = stackalloc Vector256[4]; + Span> bDataLanes = stackalloc Vector256[4]; + + const int bytesPerRgbStride = 24; + for (int i = 0; i < 4; i++) + { + // 16x2 => 8x1 + // left 8x8 column conversions + for (int j = 0; j < 4; j += 2) + { + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte(); + + rgb = Avx2.Shuffle(rgb, extractRgbMask); + + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); + + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); + + int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1); + + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref yBlockLeft.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); + + rDataLanes[j] = r; + gDataLanes[j] = g; + bDataLanes[j] = b; + } + + // 16x2 => 8x1 + // right 8x8 column conversions + for (int j = 1; j < 4; j += 2) + { + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte(); + + rgb = Avx2.Shuffle(rgb, extractRgbMask); + + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); + + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); + + int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1); + + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref yBlockRight.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); + + rDataLanes[j] = r; + gDataLanes[j] = g; + bDataLanes[j] = b; + } + + r = Scale_8x4_4x2(rDataLanes); + g = Scale_8x4_4x2(gDataLanes); + b = Scale_8x4_4x2(bDataLanes); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + } +#endif + } + +#if SUPPORTS_RUNTIME_INTRINSICS + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 Scale_8x4_4x2(Span> v) + { + Vector256 switchInnerDoubleWords = Unsafe.As>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32)); + var f025 = Vector256.Create(0.25f); + + Vector256 topPairSum = SumHorizontalPairs(v[0], v[2]); + Vector256 botPairSum = SumHorizontalPairs(v[1], v[3]); + + return Avx2.PermuteVar8x32(Avx.Multiply(SumVerticalPairs(topPairSum, botPairSum), f025), switchInnerDoubleWords); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 SumHorizontalPairs(Vector256 v0, Vector256 v1) + => Avx.Add(Avx.Shuffle(v0, v1, 0b10_00_10_00), Avx.Shuffle(v0, v1, 0b11_01_11_01)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 SumVerticalPairs(Vector256 v0, Vector256 v1) + => Avx.Add(Avx.Shuffle(v0, v1, 0b01_00_01_00), Avx.Shuffle(v0, v1, 0b11_10_11_10)); + + public static void ConvertCbCr(ref Block8x8F rBlock, ref Block8x8F gBlock, ref Block8x8F bBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + { + var fn0168736 = Vector256.Create(-0.168736f); + var fn0331264 = Vector256.Create(-0.331264f); + var f128 = Vector256.Create(128f); + var fn0418688 = Vector256.Create(-0.418688f); + var fn0081312F = Vector256.Create(-0.081312F); + var f05 = Vector256.Create(0.5f); + + ref Vector256 destCbRef = ref cbBlock.V0; + ref Vector256 destCrRef = ref crBlock.V0; + + ref Vector256 rRef = ref rBlock.V0; + ref Vector256 gRef = ref gBlock.V0; + ref Vector256 bRef = ref bBlock.V0; + + for (int i = 0; i < 8; i++) + { + ref Vector256 r = ref Unsafe.Add(ref rRef, i); + ref Vector256 g = ref Unsafe.Add(ref gRef, i); + ref Vector256 b = ref Unsafe.Add(ref bRef, i); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + } + } +#endif } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs new file mode 100644 index 0000000000..e0e7854b0d --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs @@ -0,0 +1,113 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using SixLabors.ImageSharp.Advanced; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder +{ + /// + /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks. + /// + /// The pixel type to work on + internal ref struct YCbCrForwardConverter420 + where TPixel : unmanaged, IPixel + { + // TODO: docs + private const int PixelsPerSample = 16 * 8; + + // TODO: docs + private static int RgbSpanByteSize = PixelsPerSample * 3; + + // TODO: docs + private static readonly Size SampleSize = new Size(16, 8); + + /// + /// The left Y component + /// + public Block8x8F YLeft; + + /// + /// The left Y component + /// + public Block8x8F YRight; + + /// + /// The Cb component + /// + public Block8x8F Cb; + + /// + /// The Cr component + /// + public Block8x8F Cr; + + /// + /// The color conversion tables + /// + private RgbToYCbCrConverterLut colorTables; + + /// + /// Temporal 16x8 block to hold TPixel data + /// + private Span pixelSpan; + + /// + /// Temporal RGB block + /// + private Span rgbSpan; + + // TODO: docs + private Size samplingAreaSize; + + // TODO: docs + private Configuration config; + + + public YCbCrForwardConverter420(ImageFrame frame) + { + // matrices would be filled during convert calls + this.YLeft = default; + this.YRight = default; + this.Cb = default; + this.Cr = default; + + // temporal pixel buffers + this.pixelSpan = new TPixel[PixelsPerSample].AsSpan(); + this.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan()); + + // frame data + this.samplingAreaSize = new Size(frame.Width, frame.Height); + this.config = frame.GetConfiguration(); + + // conversion vector fallback data + if (!RgbToYCbCrConverterVectorized.IsSupported) + { + this.colorTables = RgbToYCbCrConverterLut.Create(); + } + else + { + this.colorTables = default; + } + } + + public void Convert(int x, int y, ref RowOctet currentRows, int idx) + { + YCbCrForwardConverter.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize); + + PixelOperations.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan); + + if (RgbToYCbCrConverterVectorized.IsSupported) + { + RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx); + } + else + { + this.colorTables.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx); + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs new file mode 100644 index 0000000000..f3ae339347 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs @@ -0,0 +1,135 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using SixLabors.ImageSharp.Advanced; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder +{ + /// + /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks. + /// + /// The pixel type to work on + internal ref struct YCbCrForwardConverter444 + where TPixel : unmanaged, IPixel + { + // TODO: docs + private const int PixelsPerSample = 8 * 8; + + // TODO: docs + private const int RgbSpanByteSize = PixelsPerSample * 3; + + // TODO: docs + private static readonly Size SampleSize = new Size(8, 8); + + + /// + /// The Y component + /// + public Block8x8F Y; + + /// + /// The Cb component + /// + public Block8x8F Cb; + + /// + /// The Cr component + /// + public Block8x8F Cr; + + /// + /// The color conversion tables + /// + private RgbToYCbCrConverterLut colorTables; + + /// + /// Temporal 64-byte span to hold unconverted TPixel data + /// + private Span pixelSpan; + + /// + /// Temporal 64-byte span to hold converted Rgb24 data + /// + private Span rgbSpan; + + // TODO: docs + private Size samplingAreaSize; + + // TODO: docs + private readonly Configuration config; + + public YCbCrForwardConverter444(ImageFrame frame) + { + // matrices would be filled during convert calls + this.Y = default; + this.Cb = default; + this.Cr = default; + + // temporal pixel buffers + this.pixelSpan = new TPixel[PixelsPerSample].AsSpan(); + this.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan()); + + // frame data + this.samplingAreaSize = new Size(frame.Width, frame.Height); + this.config = frame.GetConfiguration(); + + // conversion vector fallback data + if (!RgbToYCbCrConverterVectorized.IsSupported) + { + this.colorTables = RgbToYCbCrConverterLut.Create(); + } + else + { + this.colorTables = default; + } + } + + public static YCbCrForwardConverter444 Create() + { + var result = default(YCbCrForwardConverter444); + + // creating rgb pixel bufferr + // TODO: this is subject to discuss + // converter.Convert comments for +8 padding + result.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + 8].AsSpan()); + + // TODO: this is subject to discuss + result.pixelSpan = new TPixel[PixelsPerSample].AsSpan(); + + // Avoid creating lookup tables, when vectorized converter is supported + if (!RgbToYCbCrConverterVectorized.IsSupported) + { + result.colorTables = RgbToYCbCrConverterLut.Create(); + } + + return result; + } + + /// + /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (, , ) + /// + public void Convert(int x, int y, ref RowOctet currentRows) + { + YCbCrForwardConverter.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize); + + PixelOperations.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan); + + ref Block8x8F yBlock = ref this.Y; + ref Block8x8F cbBlock = ref this.Cb; + ref Block8x8F crBlock = ref this.Cr; + + if (RgbToYCbCrConverterVectorized.IsSupported) + { + RgbToYCbCrConverterVectorized.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + } + else + { + this.colorTables.Convert444(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs index 81e64b277b..f5ef770914 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs @@ -2,81 +2,59 @@ // Licensed under the Apache License, Version 2.0. using System; -using SixLabors.ImageSharp.Advanced; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { - /// - /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks. - /// - /// The pixel type to work on - internal ref struct YCbCrForwardConverter + internal static class YCbCrForwardConverter where TPixel : unmanaged, IPixel { - /// - /// The Y component - /// - public Block8x8F Y; - - /// - /// The Cb component - /// - public Block8x8F Cb; - - /// - /// The Cr component - /// - public Block8x8F Cr; + public static void LoadAndStretchEdges(RowOctet source, Span dest, Point start, Size sampleSize, Size totalSize) + { + DebugGuard.MustBeBetweenOrEqualTo(start.X, 1, totalSize.Width - 1, nameof(start.X)); + DebugGuard.MustBeBetweenOrEqualTo(start.Y, 1, totalSize.Height - 1, nameof(start.Y)); - /// - /// The color conversion tables - /// - private RgbToYCbCrConverterLut colorTables; + int width = Math.Min(sampleSize.Width, totalSize.Width - start.X); + int height = Math.Min(sampleSize.Height, totalSize.Height - start.Y); - /// - /// Temporal 8x8 block to hold TPixel data - /// - private GenericBlock8x8 pixelBlock; + uint byteWidth = (uint)(width * Unsafe.SizeOf()); + int remainderXCount = sampleSize.Width - width; - /// - /// Temporal RGB block - /// - private GenericBlock8x8 rgbBlock; + ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast(dest)); + int rowSizeInBytes = sampleSize.Width * Unsafe.SizeOf(); - public static YCbCrForwardConverter Create() - { - var result = default(YCbCrForwardConverter); - if (!RgbToYCbCrConverterVectorized.IsSupported) + for (int y = 0; y < height; y++) { - // Avoid creating lookup tables, when vectorized converter is supported - result.colorTables = RgbToYCbCrConverterLut.Create(); - } + Span row = source[y]; - return result; - } + ref byte s = ref Unsafe.As(ref row[start.X]); + ref byte d = ref Unsafe.Add(ref blockStart, y * rowSizeInBytes); - /// - /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (, , ) - /// - public void Convert(ImageFrame frame, int x, int y, ref RowOctet currentRows) - { - this.pixelBlock.LoadAndStretchEdges(frame.PixelBuffer, x, y, ref currentRows); + Unsafe.CopyBlock(ref d, ref s, byteWidth); + + ref TPixel last = ref Unsafe.Add(ref Unsafe.As(ref d), width - 1); - Span rgbSpan = this.rgbBlock.AsSpanUnsafe(); - PixelOperations.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), rgbSpan); + for (int x = 1; x <= remainderXCount; x++) + { + Unsafe.Add(ref last, x) = last; + } + } - ref Block8x8F yBlock = ref this.Y; - ref Block8x8F cbBlock = ref this.Cb; - ref Block8x8F crBlock = ref this.Cr; + int remainderYCount = sampleSize.Height - height; - if (RgbToYCbCrConverterVectorized.IsSupported) + if (remainderYCount == 0) { - RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + return; } - else + + ref byte lastRowStart = ref Unsafe.Add(ref blockStart, (height - 1) * rowSizeInBytes); + + for (int y = 1; y <= remainderYCount; y++) { - this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + ref byte remStart = ref Unsafe.Add(ref lastRowStart, rowSizeInBytes * y); + Unsafe.CopyBlock(ref remStart, ref lastRowStart, (uint)rowSizeInBytes); } } } diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs index e22259f764..e807c416bd 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs @@ -13,9 +13,10 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg { public class EncodeJpeg { + [Params(50, 75, 95, 100)] + public int Quality; + private const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr; - // GDI+ most likely uses 75 as default quality - https://stackoverflow.com/questions/3957477/what-quality-level-does-image-save-use-for-jpeg-files - private const int EncodingQuality = 75; // GDI+ uses 4:2:0 subsampling private const JpegSubsample EncodingSubsampling = JpegSubsample.Ratio420; @@ -41,14 +42,14 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg this.bmpCore = Image.Load(this.bmpStream); this.bmpCore.Metadata.ExifProfile = null; - this.encoder = new JpegEncoder { Quality = EncodingQuality, Subsample = EncodingSubsampling }; + this.encoder = new JpegEncoder { Quality = Quality, Subsample = EncodingSubsampling }; this.bmpStream.Position = 0; this.bmpDrawing = SDImage.FromStream(this.bmpStream); this.jpegCodec = GetEncoder(ImageFormat.Jpeg); this.encoderParameters = new EncoderParameters(1); // Quality cast to long is necessary - this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)EncodingQuality); + this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)Quality); this.destinationStream = new MemoryStream(); } diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs index 1db4072932..60a5853847 100644 --- a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs +++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs @@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder Block8x8F cb = default; Block8x8F cr = default; - this.converter.Convert(this.data.AsSpan(), ref y, ref cb, ref cr); + this.converter.Convert444(this.data.AsSpan(), ref y, ref cb, ref cr); } [Benchmark] diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs index 9a6fc8d6fd..c605a6cf89 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -32,7 +32,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg Block8x8F cb = default; Block8x8F cr = default; - target.Convert(data.AsSpan(), ref y, ref cb, ref cr); + target.Convert444(data.AsSpan(), ref y, ref cb, ref cr); Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F)); }