diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index ff5ce957e..f6e55153a 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -123,8 +123,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void Encode420(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - Span temporalBlocks = stackalloc Block8x8F[2]; - var unzig = ZigZag.CreateUnzigTable(); var pixelConverter = YCbCrForwardConverter.Create(); @@ -140,18 +138,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder cancellationToken.ThrowIfCancellationRequested(); for (int x = 0; x < pixels.Width; x += 16) { - for (int i = 0; i < 4; i++) + for(int i = 0; i < 2; i++) { - int xOff = (i & 1) * 8; - int yOff = (i & 2) * 4; - + int yOff = i * 8; currentRows.Update(pixelBuffer, y + yOff); - pixelConverter.Convert420(frame, x + xOff, y + yOff, ref currentRows, i); + pixelConverter.Convert420(frame, x, y, ref currentRows, i); + + prevDCY = this.WriteBlock( + QuantIndex.Luminance, + prevDCY, + ref pixelConverter.twinBlocksY[0], + ref luminanceQuantTable, + ref unzig); prevDCY = this.WriteBlock( QuantIndex.Luminance, prevDCY, - ref pixelConverter.Y, + ref pixelConverter.twinBlocksY[1], ref luminanceQuantTable, ref unzig); } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs index 055c7176a..a44b174d8 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -204,14 +204,96 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder #endif } + /// + /// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling + /// + /// + /// + /// + /// + /// + /// + public static void Convert420_16x8(ReadOnlySpan rgbSpan, Span yBlocks, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row) + { + Debug.Assert(IsSupported, "AVX2 is required to run this converter"); + +#if SUPPORTS_RUNTIME_INTRINSICS + var f0299 = Vector256.Create(0.299f); + var f0587 = Vector256.Create(0.587f); + var f0114 = Vector256.Create(0.114f); + var fn0168736 = Vector256.Create(-0.168736f); + var fn0331264 = Vector256.Create(-0.331264f); + var f128 = Vector256.Create(128f); + var fn0418688 = Vector256.Create(-0.418688f); + var fn0081312F = Vector256.Create(-0.081312F); + var f05 = Vector256.Create(0.5f); + var zero = Vector256.Create(0).AsByte(); + + ref Vector256 rgbByteSpan = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); + + int destOffset = row * 4; + + ref Vector256 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), destOffset); + ref Vector256 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), destOffset); + + var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); + var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); + Vector256 rgb, rg, bx; + Vector256 r, g, b; + + Span> rDataLanes = stackalloc Vector256[4]; + Span> gDataLanes = stackalloc Vector256[4]; + Span> bDataLanes = stackalloc Vector256[4]; + + const int bytesPerRgbStride = 24; + for (int i = 0; i < 4; i++) + { + // 16x2 => 8x1 + for (int j = 0; j < 4; j++) + { + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte(); + + rgb = Avx2.Shuffle(rgb, extractRgbMask); + + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); + + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); + + int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1); + + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref yBlocks[j & 1].V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); + + rDataLanes[j] = r; + gDataLanes[j] = g; + bDataLanes[j] = b; + } + + r = Scale_8x4_4x2(rDataLanes); + g = Scale_8x4_4x2(gDataLanes); + b = Scale_8x4_4x2(bDataLanes); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + } +#endif + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 Scale_8x4_4x2(Span> v) { Vector256 switchInnerDoubleWords = Unsafe.As>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32)); var f025 = Vector256.Create(0.25f); - Vector256 topPairSum = SumHorizontalPairs(v[0], v[1]); - Vector256 botPairSum = SumHorizontalPairs(v[2], v[3]); + Vector256 topPairSum = SumHorizontalPairs(v[0], v[2]); + Vector256 botPairSum = SumHorizontalPairs(v[1], v[3]); return Avx2.PermuteVar8x32(Avx.Multiply(SumVerticalPairs(topPairSum, botPairSum), f025), switchInnerDoubleWords); } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs index 952dde111..120b21e10 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs @@ -46,14 +46,20 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder /// private Span rgbSpan; + public Span twinBlocksY; + public static YCbCrForwardConverter Create() { var result = default(YCbCrForwardConverter); // creating rgb pixel bufferr // TODO: this is subject to discuss - result.rgbSpan = MemoryMarshal.Cast(new byte[200].AsSpan()); - result.pixelSpan = new TPixel[64].AsSpan(); + const int twoBlocksByteSizeWithPadding = 384 + 8; // converter.Convert comments for +8 padding + result.rgbSpan = MemoryMarshal.Cast(new byte[twoBlocksByteSizeWithPadding].AsSpan()); + // TODO: this size should be configurable + result.pixelSpan = new TPixel[128].AsSpan(); + + result.twinBlocksY = new Block8x8F[2].AsSpan(); // Avoid creating lookup tables, when vectorized converter is supported if (!RgbToYCbCrConverterVectorized.IsSupported) @@ -70,7 +76,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void Convert(ImageFrame frame, int x, int y, ref RowOctet currentRows) { Memory.Buffer2D buffer = frame.PixelBuffer; - LoadAndStretchEdges(currentRows, this.pixelSpan, x, buffer.Width - x, buffer.Height - y); + LoadAndStretchEdges(currentRows, this.pixelSpan, x, buffer.Width - x, buffer.Height - y, new Size(8)); PixelOperations.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan); @@ -94,13 +100,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder public void Convert420(ImageFrame frame, int x, int y, ref RowOctet currentRows, int idx) { Memory.Buffer2D buffer = frame.PixelBuffer; - LoadAndStretchEdges(currentRows, this.pixelSpan, x, Math.Min(8, buffer.Width - x), Math.Min(8, buffer.Height - y)); + LoadAndStretchEdges(currentRows, this.pixelSpan, x, Math.Min(16, buffer.Width - x), Math.Min(8, buffer.Height - y), new Size(16, 8)); PixelOperations.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan); if (RgbToYCbCrConverterVectorized.IsSupported) { - RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.Y, ref this.Cb, ref this.Cr, idx); + RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, this.twinBlocksY, ref this.Cb, ref this.Cr, idx); } else { @@ -110,7 +116,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } // TODO: add DebugGuard checks? - private static void LoadAndStretchEdges(RowOctet source, Span dest, int startX, int width, int height) + private static void LoadAndStretchEdges(RowOctet source, Span dest, int startX, int width, int height, Size areaSize) { //Guard.MustBeBetweenOrEqualTo(width, 1, 8, nameof(width)); //Guard.MustBeBetweenOrEqualTo(height, 1, 8, nameof(width)); @@ -122,10 +128,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } uint byteWidth = (uint)(width * Unsafe.SizeOf()); - int remainderXCount = 8 - width; + int remainderXCount = areaSize.Width - width; ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast(dest)); - int rowSizeInBytes = 8 * Unsafe.SizeOf(); + int rowSizeInBytes = areaSize.Width * Unsafe.SizeOf(); for (int y = 0; y < height; y++) { @@ -144,7 +150,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } } - int remainderYCount = 8 - height; + int remainderYCount = areaSize.Height - height; if (remainderYCount == 0) {