[WIP] Implemented 16x8 420 subsampling conversion

5 years ago · d50e255c85
3 changed files with 110 additions and 19 deletions
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@ -123,8 +123,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
            where TPixel : unmanaged, IPixel<TPixel>
        {
-            Span<Block8x8F> temporalBlocks = stackalloc Block8x8F[2];
-
            var unzig = ZigZag.CreateUnzigTable();

            var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
@ -140,18 +138,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                cancellationToken.ThrowIfCancellationRequested();
                for (int x = 0; x < pixels.Width; x += 16)
                {
-                    for (int i = 0; i < 4; i++)
+                    for(int i = 0; i < 2; i++)
                    {
-                        int xOff = (i & 1) * 8;
-                        int yOff = (i & 2) * 4;
-
+                        int yOff = i * 8;
                        currentRows.Update(pixelBuffer, y + yOff);
-                        pixelConverter.Convert420(frame, x + xOff, y + yOff, ref currentRows, i);
+                        pixelConverter.Convert420(frame, x, y, ref currentRows, i);
+
+                        prevDCY = this.WriteBlock(
+                            QuantIndex.Luminance,
+                            prevDCY,
+                            ref pixelConverter.twinBlocksY[0],
+                            ref luminanceQuantTable,
+                            ref unzig);

                        prevDCY = this.WriteBlock(
                            QuantIndex.Luminance,
                            prevDCY,
-                            ref pixelConverter.Y,
+                            ref pixelConverter.twinBlocksY[1],
                            ref luminanceQuantTable,
                            ref unzig);
                    }
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@ -204,14 +204,96 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 #endif
        }

+        /// <summary>
+        /// Converts a 16x8 Rgb24 pixel area into two 8x8 Y blocks and four rows of the Cb and Cr blocks (4:2:0 subsampling).
+        /// </summary>
+        /// <param name="rgbSpan">Source pixels, read as 16 strides of 24 bytes with 32-byte loads; the last load ends at byte 392, so the buffer needs 8 bytes of padding after the 384 bytes of pixel data.</param>
+        /// <param name="yBlocks">Destination luminance blocks; even-numbered strides are written to element 0 and odd-numbered strides to element 1,
+        /// so the span must hold at least two blocks. Presumably element 0 is the left and element 1 the right 8x8 half - confirm against the caller's row layout.</param>
+        /// <param name="cbBlock">Destination Cb block; four consecutive 8-float vectors are written, starting at vector index row * 4.</param>
+        /// <param name="crBlock">Destination Cr block; four consecutive 8-float vectors are written, starting at vector index row * 4.</param>
+        /// <param name="row">Selects which group of four chroma vectors is written (destination offset is row * 4).</param>
+        public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, Span<Block8x8F> yBlocks, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+        {
+            Debug.Assert(IsSupported, "AVX2 is required to run this converter");
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+            var f0299 = Vector256.Create(0.299f);
+            var f0587 = Vector256.Create(0.587f);
+            var f0114 = Vector256.Create(0.114f);
+            var fn0168736 = Vector256.Create(-0.168736f);
+            var fn0331264 = Vector256.Create(-0.331264f);
+            var f128 = Vector256.Create(128f);
+            var fn0418688 = Vector256.Create(-0.418688f);
+            var fn0081312F = Vector256.Create(-0.081312F);
+            var f05 = Vector256.Create(0.5f);
+            var zero = Vector256.Create(0).AsByte();
+
+            ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
+            // Each call writes four chroma vectors, so the destination starts row * 4 vectors into the Cb/Cr blocks
+            int destOffset = row * 4;
+
+            ref Vector256<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock), destOffset);
+            ref Vector256<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock), destOffset);
+
+            var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
+            var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
+            Vector256<byte> rgb, rg, bx;
+            Vector256<float> r, g, b;
+
+            Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
+            Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
+            Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];
+            // 24 bytes = 8 three-byte Rgb24 pixels per stride; the 32-byte vector load below overlaps 8 bytes of the next stride
+            const int bytesPerRgbStride = 24;
+            for (int i = 0; i < 4; i++)
+            {
+                // Each outer iteration consumes four 8-pixel strides (16x2 pixels) and produces one 8-wide chroma vector (16x2 => 8x1)
+                for (int j = 0; j < 4; j++)
+                {
+                    rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
+
+                    rgb = Avx2.Shuffle(rgb, extractRgbMask);
+
+                    rg = Avx2.UnpackLow(rgb, zero);
+                    bx = Avx2.UnpackHigh(rgb, zero);
+
+                    r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
+                    g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
+                    b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
+                    // (i * 2) + bit 1 of j is the destination row inside the Y block; bit 0 of j (used below) selects which Y block
+                    int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1);
+
+                    // Y = (0.299F * r) + (0.587F * g) + (0.114F * b), stored at full resolution
+                    Unsafe.Add(ref yBlocks[j & 1].V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
+
+                    rDataLanes[j] = r;
+                    gDataLanes[j] = g;
+                    bDataLanes[j] = b;
+                }
+
+                r = Scale_8x4_4x2(rDataLanes);
+                g = Scale_8x4_4x2(gDataLanes);
+                b = Scale_8x4_4x2(bDataLanes);
+
+                // Cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)), computed from the reduced r/g/b vectors
+                Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
+
+                // Cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)), computed from the reduced r/g/b vectors
+                Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
+            }
+#endif
+        }
+
+        /// <summary>Reduces four vectors to one: the horizontal pair sums of (v[0], v[2]) and (v[1], v[3]) are summed vertically, scaled by 0.25 and permuted with the switch-inner-dwords mask; presumably a 2x2 box average - confirm against SumHorizontalPairs/SumVerticalPairs.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static Vector256<float> Scale_8x4_4x2(Span<Vector256<float>> v)
        {
            Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
            var f025 = Vector256.Create(0.25f);

-            Vector256<float> topPairSum = SumHorizontalPairs(v[0], v[1]);
-            Vector256<float> botPairSum = SumHorizontalPairs(v[2], v[3]);
+            Vector256<float> topPairSum = SumHorizontalPairs(v[0], v[2]);
+            Vector256<float> botPairSum = SumHorizontalPairs(v[1], v[3]);

            return Avx2.PermuteVar8x32(Avx.Multiply(SumVerticalPairs(topPairSum, botPairSum), f025), switchInnerDoubleWords);
        }
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@ -46,14 +46,20 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        /// </summary>
        private Span<Rgb24> rgbSpan;

+        public Span<Block8x8F> twinBlocksY;
+
        public static YCbCrForwardConverter<TPixel> Create()
        {
            var result = default(YCbCrForwardConverter<TPixel>);

            // creating rgb pixel bufferr
            // TODO: this is subject to discuss
-            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[200].AsSpan());
-            result.pixelSpan = new TPixel[64].AsSpan();
+            const int twoBlocksByteSizeWithPadding = 384 + 8; // converter.Convert comments for +8 padding
+            result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[twoBlocksByteSizeWithPadding].AsSpan());
+            // TODO: this size should be configurable
+            result.pixelSpan = new TPixel[128].AsSpan();
+
+            result.twinBlocksY = new Block8x8F[2].AsSpan();

            // Avoid creating lookup tables, when vectorized converter is supported
            if (!RgbToYCbCrConverterVectorized.IsSupported)
@ -70,7 +76,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows)
        {
            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            LoadAndStretchEdges(currentRows, this.pixelSpan, x, buffer.Width - x, buffer.Height - y);
+            LoadAndStretchEdges(currentRows, this.pixelSpan, x, buffer.Width - x, buffer.Height - y, new Size(8));

            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);

@ -94,13 +100,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
        {
            Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
-            LoadAndStretchEdges(currentRows, this.pixelSpan, x, Math.Min(8, buffer.Width - x), Math.Min(8, buffer.Height - y));
+            LoadAndStretchEdges(currentRows, this.pixelSpan, x, Math.Min(16, buffer.Width - x), Math.Min(8, buffer.Height - y), new Size(16, 8));

            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);

            if (RgbToYCbCrConverterVectorized.IsSupported)
            {
-                RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.Y, ref this.Cb, ref this.Cr, idx);
+                RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, this.twinBlocksY, ref this.Cb, ref this.Cr, idx);
            }
            else
            {
@ -110,7 +116,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        }

        // TODO: add DebugGuard checks?
-        private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int width, int height)
+        private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int width, int height, Size areaSize)
        {
            //Guard.MustBeBetweenOrEqualTo(width, 1, 8, nameof(width));
            //Guard.MustBeBetweenOrEqualTo(height, 1, 8, nameof(width));
@ -122,10 +128,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
            }

            uint byteWidth = (uint)(width * Unsafe.SizeOf<TPixel>());
-            int remainderXCount = 8 - width;
+            int remainderXCount = areaSize.Width - width;

            ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast<TPixel, byte>(dest));
-            int rowSizeInBytes = 8 * Unsafe.SizeOf<TPixel>();
+            int rowSizeInBytes = areaSize.Width * Unsafe.SizeOf<TPixel>();

            for (int y = 0; y < height; y++)
            {
@ -144,7 +150,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                }
            }

-            int remainderYCount = 8 - height;
+            int remainderYCount = areaSize.Height - height;

            if (remainderYCount == 0)
            {