Converters tests/code cleanup, added comments for padding property

5 years ago · 8f79eb93c2
4 changed files with 39 additions and 152 deletions
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@ -27,15 +27,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
            }
        }
-        public static int AvxRegisterRgbCompatibilityPadding
+        public static int AvxCompatibilityPadding
        {
            // Number of extra bytes a caller must append to an rgb byte buffer so that
            // full 256-bit AVX loads over the pixel data never read outside the buffer.
            //
            // rgb byte matrices contain 8 strides by 8 pixels each, thus 64 pixels total
            // Strides are stored sequentially - one big span of 64 * 3 = 192 bytes
            // Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits
            // Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride:
            // stride 0    0    - 192  -(+64bits)-> 256
            // stride 1    192  - 384  -(+64bits)-> 448
            // stride 2    384  - 576  -(+64bits)-> 640
            // stride 3    576  - 768  -(+64bits)-> 832
            // stride 4    768  - 960  -(+64bits)-> 1024
            // stride 5    960  - 1152 -(+64bits)-> 1216
            // stride 6    1152 - 1344 -(+64bits)-> 1408
            // stride 7    1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION
            //
            // Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits
            // This is not permitted - we are reading foreign memory
            //
            // 8 byte padding to rgb byte span will solve this problem without extra code in converters
            get
            {
 #if SUPPORTS_RUNTIME_INTRINSICS
                if (IsSupported)
                {
                    // 8 bytes == the 64 bits that the load of stride 7 reads past the pixel data
                    return 8;
                }
-
+#endif
                // Vectorized converter is unavailable, so there is no over-read to pad for
                return 0;
            }
        }
@ -89,26 +107,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
            Vector256<byte> rgb, rg, bx;
            Vector256<float> r, g, b;
            // TODO: probably remove this after the draft
            // rgbByteSpan contains 8 strides by 8 pixels each, thus 64 pixels total
            // Strides are stored sequentially - one big span of 64 * 3 = 192 bytes
            // Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits
            // Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride:
            // stride 0    0    - 192  -(+64bits)-> 256
            // stride 1    192  - 384  -(+64bits)-> 448
            // stride 2    384  - 576  -(+64bits)-> 640
            // stride 3    576  - 768  -(+64bits)-> 832
            // stride 4    768  - 960  -(+64bits)-> 1024
            // stride 5    960  - 1152 -(+64bits)-> 1216
            // stride 6    1152 - 1344 -(+64bits)-> 1408
            // stride 7    1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION
            //
            // Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits
            // This is not permitted - we are reading foreign memory
            // That's why the last stride is calculated outside of the for-loop, with a special extract shuffle mask involved
            //
            // Extra mask & separate stride:7 calculations can be eliminated by simply providing rgb pixel span of slightly bigger size than pixels data need:
            // Total pixel data size is 192 bytes, avx registers need it to be 200 bytes
            const int bytesPerRgbStride = 24;
            for (int i = 0; i < 8; i++)
            {
@ -135,91 +133,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 #endif
        }
        /// <summary>
        /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:2:0 subsampling
        /// </summary>
        /// <remarks>
        /// Total size of rgb span must be 200 bytes: each of the 8 strides is read with a 32-byte load
        /// starting at byte offset 24 * stride, so the last load covers bytes 168..199 although the
        /// pixel data itself ends at byte 192.
        /// When SUPPORTS_RUNTIME_INTRINSICS is not defined the method body is compiled out and only the assert remains.
        /// </remarks>
        /// <param name="rgbSpan">Source pixels, 8 strides of 8 pixels stored sequentially, plus the padding described in remarks.</param>
        /// <param name="yBlock">Destination luminance block; all 8 rows are overwritten.</param>
        /// <param name="cbBlock">Destination Cb block; only the region selected by <paramref name="idx"/> is written.</param>
        /// <param name="crBlock">Destination Cr block; only the region selected by <paramref name="idx"/> is written.</param>
        /// <param name="idx">
        /// Selects the destination region inside the chroma blocks; only bits 0 and 1 are used.
        /// Bit 0 shifts the destination by one Vector128 (4 floats), bit 1 by eight Vector128s (32 floats).
        /// </param>
        public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock, int idx)
        {
            Debug.Assert(IsSupported, "AVX2 is required to run this converter");
 #if SUPPORTS_RUNTIME_INTRINSICS
            // Conversion coefficients broadcast to all 8 float lanes
            var f0299 = Vector256.Create(0.299f);
            var f0587 = Vector256.Create(0.587f);
            var f0114 = Vector256.Create(0.114f);
            var fn0168736 = Vector256.Create(-0.168736f);
            var fn0331264 = Vector256.Create(-0.331264f);
            var f128 = Vector256.Create(128f);
            var fn0418688 = Vector256.Create(-0.418688f);
            var fn0081312F = Vector256.Create(-0.081312F);
            var f05 = Vector256.Create(0.5f);
            var zero = Vector256.Create(0).AsByte();
            // Pixel bytes viewed as 256-bit vectors; strides are addressed by byte offset below
            // because a 24-byte stride does not fall on 32-byte vector boundaries
            ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
            ref Vector256<float> destYRef = ref yBlock.V0;
            // Offset in Vector128<float> units: idx 0..3 -> 0, 1, 8, 9
            // NOTE(review): presumably the four quadrants of a row-major 8x8 chroma block - confirm against Block8x8F layout
            int destOffset = (idx & 2) * 4 + (idx & 1);
            ref Vector128<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref cbBlock), destOffset);
            ref Vector128<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref crBlock), destOffset);
            // Shuffle masks are defined elsewhere in this class
            // NOTE(review): judging by the names, the first spreads 24 stride bytes over both 128-bit lanes
            // and the second groups the r, g and b bytes inside each lane - confirm against the mask tables
            var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
            var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
            Vector256<byte> rgb, rg, bx;
            Vector256<float> r, g, b;
            // Per-channel float rows of the current 4-row group, kept for the chroma downscale
            Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
            Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
            Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];
            const int bytesPerRgbStride = 24;
            // i selects the group of 4 strides, j the stride inside that group
            for (int i = 0; i < 2; i++)
            {
                // each 4 lanes - [0, 1, 2, 3] & [4, 5, 6, 7]
                for (int j = 0; j < 4; j++)
                {
                    // 32-byte load at the start of stride (i * 4 + j); the trailing 8 bytes belong to the next stride or the padding
                    rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
                    rgb = Avx2.Shuffle(rgb, extractRgbMask);
                    // Interleaving with zero widens the channel bytes: 8 -> 16 bits here, 16 -> 32 bits below
                    rg = Avx2.UnpackLow(rgb, zero);
                    bx = Avx2.UnpackHigh(rgb, zero);
                    r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
                    g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
                    b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
                    Unsafe.Add(ref destYRef, i * 4 + j) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
                    rDataLanes[j] = r;
                    gDataLanes[j] = g;
                    bDataLanes[j] = b;
                }
                // i is 0 or 1, so this is 0 or 4 Vector128s on top of destOffset
                int localDestOffset = (i & 1) * 4;
                // NOTE(review): Scale_8x4_4x2 is defined elsewhere; presumably it reduces 4 rows of 8 values
                // to 2 rows of 4 values packed into one Vector256 - confirm
                r = Scale_8x4_4x2(rDataLanes);
                g = Scale_8x4_4x2(gDataLanes);
                b = Scale_8x4_4x2(bDataLanes);
                // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
                Vector256<float> cb = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
                // The two halves are stored 2 Vector128s (8 floats) apart
                Unsafe.Add(ref destCbRef, localDestOffset) = cb.GetLower();
                Unsafe.Add(ref destCbRef, localDestOffset + 2) = cb.GetUpper();
                // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
                Vector256<float> cr = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
                Unsafe.Add(ref destCrRef, localDestOffset) = cr.GetLower();
                Unsafe.Add(ref destCrRef, localDestOffset + 2) = cr.GetUpper();
            }
 #endif
        }
        /// <summary>
        /// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling
        /// </summary>
-        public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+        public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
        {
            Debug.Assert(IsSupported, "AVX2 is required to run this converter");
@ -337,36 +254,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static Vector256<float> SumVerticalPairs(Vector256<float> v0, Vector256<float> v1)
        {
            // Per 128-bit lane: [v0[0], v0[1], v1[0], v1[1]]
            Vector256<float> lowPairs = Avx.Shuffle(v0, v1, 0b01_00_01_00);

            // Per 128-bit lane: [v0[2], v0[3], v1[2], v1[3]]
            Vector256<float> highPairs = Avx.Shuffle(v0, v1, 0b11_10_11_10);

            // Element-wise sum, i.e. per lane: [v0[0]+v0[2], v0[1]+v0[3], v1[0]+v1[2], v1[1]+v1[3]]
            return Avx.Add(lowPairs, highPairs);
        }
        public static void ConvertCbCr(ref Block8x8F rBlock, ref Block8x8F gBlock, ref Block8x8F bBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
        {
            // Chroma coefficients broadcast to every float lane
            Vector256<float> chromaBias = Vector256.Create(128f);
            Vector256<float> half = Vector256.Create(0.5f);
            Vector256<float> cbRedWeight = Vector256.Create(-0.168736f);
            Vector256<float> cbGreenWeight = Vector256.Create(-0.331264f);
            Vector256<float> crGreenWeight = Vector256.Create(-0.418688f);
            Vector256<float> crBlueWeight = Vector256.Create(-0.081312F);

            // First Vector256 row of each source and destination block
            ref Vector256<float> redRows = ref rBlock.V0;
            ref Vector256<float> greenRows = ref gBlock.V0;
            ref Vector256<float> blueRows = ref bBlock.V0;
            ref Vector256<float> cbRows = ref cbBlock.V0;
            ref Vector256<float> crRows = ref crBlock.V0;

            for (int row = 0; row < 8; row++)
            {
                // Kept as ref locals so every use reads the block memory directly
                ref Vector256<float> red = ref Unsafe.Add(ref redRows, row);
                ref Vector256<float> green = ref Unsafe.Add(ref greenRows, row);
                ref Vector256<float> blue = ref Unsafe.Add(ref blueRows, row);

                // Cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
                Vector256<float> cb = Avx.Multiply(half, blue);
                cb = SimdUtils.HwIntrinsics.MultiplyAdd(cb, cbGreenWeight, green);
                cb = SimdUtils.HwIntrinsics.MultiplyAdd(cb, cbRedWeight, red);
                Unsafe.Add(ref cbRows, row) = Avx.Add(chromaBias, cb);

                // Cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
                Vector256<float> cr = Avx.Multiply(crBlueWeight, blue);
                cr = SimdUtils.HwIntrinsics.MultiplyAdd(cr, crGreenWeight, green);
                cr = SimdUtils.HwIntrinsics.MultiplyAdd(cr, half, red);
                Unsafe.Add(ref crRows, row) = Avx.Add(chromaBias, cr);
            }
        }
 #endif
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@ -77,7 +77,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
            // temporary pixel buffers
            this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
-            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan());
+            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan());
            // frame data
            this.samplingAreaSize = new Size(frame.Width, frame.Height);
@ -102,7 +102,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
            if (RgbToYCbCrConverterVectorized.IsSupported)
            {
-                RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
+                RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
            }
            else
            {
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
            // temporary pixel buffers
            this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
-            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan());
+            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan());
            // frame data
            this.samplingAreaSize = new Size(frame.Width, frame.Height);
--- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@ -92,8 +92,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            var cb = default(Block8x8F);
            var cr = default(Block8x8F);
-            RgbToYCbCrConverterVectorized.Convert420_16x8(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0);
+            RgbToYCbCrConverterVectorized.Convert420(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0);
-            RgbToYCbCrConverterVectorized.Convert420_16x8(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1);
+            RgbToYCbCrConverterVectorized.Convert420(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1);
            Verify420(data, yBlocks, ref cb, ref cr, new ApproximateFloatComparer(1F));
        }
@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            ref Block8x8F crResult,
            ApproximateFloatComparer comparer)
        {
-            var tempBlock = default(Block8x8F);
+            var trueBlock = default(Block8x8F);
            var cbTrue = new Block8x8F[4];
            var crTrue = new Block8x8F[4];
@ -133,31 +133,31 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
            // top left
            Copy8x8(data, tempData);
-            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[0], ref crTrue[0]);
+            RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[0], ref crTrue[0]);
-            VerifyBlock(ref yResult[0], ref tempBlock, comparer);
+            VerifyBlock(ref yResult[0], ref trueBlock, comparer);
            // top right
            Copy8x8(data.Slice(8), tempData);
-            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[1], ref crTrue[1]);
+            RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[1], ref crTrue[1]);
-            VerifyBlock(ref yResult[1], ref tempBlock, comparer);
+            VerifyBlock(ref yResult[1], ref trueBlock, comparer);
            // bottom left
            Copy8x8(data.Slice(8 * 16), tempData);
-            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[2], ref crTrue[2]);
+            RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[2], ref crTrue[2]);
-            VerifyBlock(ref yResult[2], ref tempBlock, comparer);
+            VerifyBlock(ref yResult[2], ref trueBlock, comparer);
            // bottom right
            Copy8x8(data.Slice((8 * 16) + 8), tempData);
-            RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[3], ref crTrue[3]);
+            RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[3], ref crTrue[3]);
-            VerifyBlock(ref yResult[3], ref tempBlock, comparer);
+            VerifyBlock(ref yResult[3], ref trueBlock, comparer);
            // verify Cb
-            Scale16X16To8X8(ref tempBlock, cbTrue);
+            Scale16X16To8X8(ref trueBlock, cbTrue);
-            VerifyBlock(ref cbResult, ref tempBlock, comparer);
+            VerifyBlock(ref cbResult, ref trueBlock, comparer);
            // verify Cr
-            Scale16X16To8X8(ref tempBlock, crTrue);
+            Scale16X16To8X8(ref trueBlock, crTrue);
-            VerifyBlock(ref crResult, ref tempBlock, comparer);
+            VerifyBlock(ref crResult, ref trueBlock, comparer);
            // extracts 8x8 blocks from 16x8 memory region