diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs index 05a1b111f5..49b974404a 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -27,15 +27,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder } } - public static int AvxRegisterRgbCompatibilityPadding + public static int AvxCompatibilityPadding { + // rgb byte matrices contain 8 strides by 8 pixels each, thus 64 pixels total + // Strides are stored sequentially - one big span of 64 * 3 = 192 bytes + // Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits + // Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride: + // stride 0 0 - 192 -(+64bits)-> 256 + // stride 1 192 - 384 -(+64bits)-> 448 + // stride 2 384 - 576 -(+64bits)-> 640 + // stride 3 576 - 768 -(+64bits)-> 832 + // stride 4 768 - 960 -(+64bits)-> 1024 + // stride 5 960 - 1152 -(+64bits)-> 1216 + // stride 6 1152 - 1344 -(+64bits)-> 1408 + // stride 7 1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION + // + // Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits + // This is not permitted - we are reading foreign memory + // + // 8 byte padding to rgb byte span will solve this problem without extra code in converters get { +#if SUPPORTS_RUNTIME_INTRINSICS if (IsSupported) { return 8; } - +#endif return 0; } } @@ -89,26 +107,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder Vector256 rgb, rg, bx; Vector256 r, g, b; - // TODO: probably remove this after the draft - // rgbByteSpan contains 8 strides by 8 pixels each, thus 64 pixels total - // Strides are stored sequentially - one big span of 64 * 3 = 192 bytes - // Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits - // Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride: - // stride 0 0 - 192 -(+64bits)-> 256 - // stride 1 192 - 384 -(+64bits)-> 448 - // stride 2 384 - 576 -(+64bits)-> 640 - // stride 3 576 - 768 -(+64bits)-> 832 - // stride 4 768 - 960 -(+64bits)-> 1024 - // stride 5 960 - 1152 -(+64bits)-> 1216 - // stride 6 1152 - 1344 -(+64bits)-> 1408 - // stride 7 1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION - // - // Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits - // This is not permitted - we are reading foreign memory - // That's why last stride is calculated outside of the for-loop loop with special extract shuffle mask involved - // - // Extra mask & separate stride:7 calculations can be eliminated by simply providing rgb pixel span of slightly bigger size than pixels data need: - // Total pixel data size is 192 bytes, avx registers need it to be 200 bytes const int bytesPerRgbStride = 24; for (int i = 0; i < 8; i++) { @@ -135,91 +133,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder #endif } - /// - /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:2:0 subsampling - /// - /// Total size of rgb span must be 200 bytes - public static void Convert420(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock, int idx) - { - Debug.Assert(IsSupported, "AVX2 is required to run this converter"); - -#if SUPPORTS_RUNTIME_INTRINSICS - var f0299 = Vector256.Create(0.299f); - var f0587 = Vector256.Create(0.587f); - var f0114 = Vector256.Create(0.114f); - var fn0168736 = Vector256.Create(-0.168736f); - var fn0331264 = Vector256.Create(-0.331264f); - var f128 = Vector256.Create(128f); - var fn0418688 = Vector256.Create(-0.418688f); - var fn0081312F = Vector256.Create(-0.081312F); - var f05 = Vector256.Create(0.5f); - var zero = Vector256.Create(0).AsByte(); - - ref Vector256 rgbByteSpan = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); - ref Vector256 destYRef = ref yBlock.V0; - - int destOffset = (idx & 2) * 4 + (idx & 1); - - ref Vector128 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), destOffset); - ref Vector128 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), destOffset); - - var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); - var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); - Vector256 rgb, rg, bx; - Vector256 r, g, b; - - Span> rDataLanes = stackalloc Vector256[4]; - Span> gDataLanes = stackalloc Vector256[4]; - Span> bDataLanes = stackalloc Vector256[4]; - - const int bytesPerRgbStride = 24; - for (int i = 0; i < 2; i++) - { - // each 4 lanes - [0, 1, 2, 3] & [4, 5, 6, 7] - for (int j = 0; j < 4; j++) - { - rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte(); - - rgb = Avx2.Shuffle(rgb, extractRgbMask); - - rg = Avx2.UnpackLow(rgb, zero); - bx = Avx2.UnpackHigh(rgb, zero); - - r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); - g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); - b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); - - // (0.299F * r) + (0.587F * g) + (0.114F * b); - Unsafe.Add(ref destYRef, i * 4 + j) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); - - rDataLanes[j] = r; - gDataLanes[j] = g; - bDataLanes[j] = b; - } - - int localDestOffset = (i & 1) * 4; - - r = Scale_8x4_4x2(rDataLanes); - g = Scale_8x4_4x2(gDataLanes); - b = Scale_8x4_4x2(bDataLanes); - - // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) - Vector256 cb = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); - Unsafe.Add(ref destCbRef, localDestOffset) = cb.GetLower(); - Unsafe.Add(ref destCbRef, localDestOffset + 2) = cb.GetUpper(); - - // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) - Vector256 cr = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); - Unsafe.Add(ref destCrRef, localDestOffset) = cr.GetLower(); - Unsafe.Add(ref destCrRef, localDestOffset + 2) = cr.GetUpper(); - } -#endif - } - /// /// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling /// - public static void Convert420_16x8(ReadOnlySpan rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row) + public static void Convert420(ReadOnlySpan rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row) { Debug.Assert(IsSupported, "AVX2 is required to run this converter"); @@ -337,36 +254,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 SumVerticalPairs(Vector256 v0, Vector256 v1) => Avx.Add(Avx.Shuffle(v0, v1, 0b01_00_01_00), Avx.Shuffle(v0, v1, 0b11_10_11_10)); - - public static void ConvertCbCr(ref Block8x8F rBlock, ref Block8x8F gBlock, ref Block8x8F bBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) - { - var fn0168736 = Vector256.Create(-0.168736f); - var fn0331264 = Vector256.Create(-0.331264f); - var f128 = Vector256.Create(128f); - var fn0418688 = Vector256.Create(-0.418688f); - var fn0081312F = Vector256.Create(-0.081312F); - var f05 = Vector256.Create(0.5f); - - ref Vector256 destCbRef = ref cbBlock.V0; - ref Vector256 destCrRef = ref crBlock.V0; - - ref Vector256 rRef = ref rBlock.V0; - ref Vector256 gRef = ref gBlock.V0; - ref Vector256 bRef = ref bBlock.V0; - - for (int i = 0; i < 8; i++) - { - ref Vector256 r = ref Unsafe.Add(ref rRef, i); - ref Vector256 g = ref Unsafe.Add(ref gRef, i); - ref Vector256 b = ref Unsafe.Add(ref bRef, i); - - // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) - Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); - - // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) - Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); - } - } #endif } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs index e0e7854b0d..9288acc7e0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs @@ -77,7 +77,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // temporal pixel buffers this.pixelSpan = new TPixel[PixelsPerSample].AsSpan(); - this.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan()); + this.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan()); // frame data this.samplingAreaSize = new Size(frame.Width, frame.Height); @@ -102,7 +102,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder if (RgbToYCbCrConverterVectorized.IsSupported) { - RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx); + RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx); } else { diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs index 0b7438725c..d611aaf9e1 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs @@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder // temporal pixel buffers this.pixelSpan = new TPixel[PixelsPerSample].AsSpan(); - this.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan()); + this.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan()); // frame data this.samplingAreaSize = new Size(frame.Width, frame.Height); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs index 9ec1bf603e..d95191ffe1 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -92,8 +92,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg var cb = default(Block8x8F); var cr = default(Block8x8F); - RgbToYCbCrConverterVectorized.Convert420_16x8(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0); - RgbToYCbCrConverterVectorized.Convert420_16x8(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1); + RgbToYCbCrConverterVectorized.Convert420(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0); + RgbToYCbCrConverterVectorized.Convert420(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1); Verify420(data, yBlocks, ref cb, ref cr, new ApproximateFloatComparer(1F)); } @@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg ref Block8x8F crResult, ApproximateFloatComparer comparer) { - var tempBlock = default(Block8x8F); + var trueBlock = default(Block8x8F); var cbTrue = new Block8x8F[4]; var crTrue = new Block8x8F[4]; @@ -133,31 +133,31 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg // top left Copy8x8(data, tempData); - RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[0], ref crTrue[0]); - VerifyBlock(ref yResult[0], ref tempBlock, comparer); + RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[0], ref crTrue[0]); + VerifyBlock(ref yResult[0], ref trueBlock, comparer); // top right Copy8x8(data.Slice(8), tempData); - RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[1], ref crTrue[1]); - VerifyBlock(ref yResult[1], ref tempBlock, comparer); + RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[1], ref crTrue[1]); + VerifyBlock(ref yResult[1], ref trueBlock, comparer); // bottom left Copy8x8(data.Slice(8 * 16), tempData); - RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[2], ref crTrue[2]); - VerifyBlock(ref yResult[2], ref tempBlock, comparer); + RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[2], ref crTrue[2]); + VerifyBlock(ref yResult[2], ref trueBlock, comparer); // bottom right Copy8x8(data.Slice((8 * 16) + 8), tempData); - RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[3], ref crTrue[3]); - VerifyBlock(ref yResult[3], ref tempBlock, comparer); + RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[3], ref crTrue[3]); + VerifyBlock(ref yResult[3], ref trueBlock, comparer); // verify Cb - Scale16X16To8X8(ref tempBlock, cbTrue); - VerifyBlock(ref cbResult, ref tempBlock, comparer); + Scale16X16To8X8(ref trueBlock, cbTrue); + VerifyBlock(ref cbResult, ref trueBlock, comparer); // verify Cr - Scale16X16To8X8(ref tempBlock, crTrue); - VerifyBlock(ref crResult, ref tempBlock, comparer); + Scale16X16To8X8(ref trueBlock, crTrue); + VerifyBlock(ref crResult, ref trueBlock, comparer); // extracts 8x8 blocks from 16x8 memory region