Browse Source

Converters tests/code cleanup, added comments for padding property

pull/1632/head
Dmitry Pentin 5 years ago
parent
commit
8f79eb93c2
  1. 155
      src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
  2. 4
      src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
  3. 2
      src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
  4. 30
      tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs

155
src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs

@ -27,15 +27,33 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
}
}
public static int AvxRegisterRgbCompatibilityPadding
/// <summary>
/// Gets the number of extra bytes that must be appended to an rgb byte buffer
/// so the vectorized converters can load every stride with a full 256-bit read.
/// Evaluates to 8 when the vectorized converter is supported, otherwise 0.
/// </summary>
public static int AvxCompatibilityPadding
{
    get
    {
        // Why padding is needed:
        //
        // An rgb byte matrix holds 8 strides of 8 pixels each (64 pixels), stored
        // back to back as one span of 64 * 3 = 192 bytes.
        // A single stride is 3 * 8 = 24 bytes = 192 bits, but an Avx register is
        // 256 bits wide, so each load also pulls in 64 bits of the following stride:
        //
        //   stride | first bit | last bit | last bit actually loaded
        //   -------+-----------+----------+-------------------------
        //      0   |      0    |    192   |    256
        //      1   |    192    |    384   |    448
        //      2   |    384    |    576   |    640
        //      3   |    576    |    768   |    832
        //      4   |    768    |    960   |   1024
        //      5   |    960    |   1152   |   1216
        //      6   |   1152    |   1344   |   1408
        //      7   |   1344    |   1536   |   1600  <-- past the end of the span
        //
        // The span is only 64 * 3 * 8 = 1536 bits long while the last load needs
        // 1600 bits, i.e. it would read memory that does not belong to the span.
        // Allocating 8 spare bytes after the pixel data makes that last load legal
        // without any special casing inside the converters.
#if SUPPORTS_RUNTIME_INTRINSICS
        return IsSupported ? 8 : 0;
#else
        return 0;
#endif
    }
}
@ -89,26 +107,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
Vector256<byte> rgb, rg, bx;
Vector256<float> r, g, b;
// TODO: probably remove this after the draft
// rgbByteSpan contains 8 strides by 8 pixels each, thus 64 pixels total
// Strides are stored sequentially - one big span of 64 * 3 = 192 bytes
// Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits
// Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride:
// stride 0 0 - 192 -(+64bits)-> 256
// stride 1 192 - 384 -(+64bits)-> 448
// stride 2 384 - 576 -(+64bits)-> 640
// stride 3 576 - 768 -(+64bits)-> 832
// stride 4 768 - 960 -(+64bits)-> 1024
// stride 5 960 - 1152 -(+64bits)-> 1216
// stride 6 1152 - 1344 -(+64bits)-> 1408
// stride 7 1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION
//
// Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits
// This is not permitted - we are reading foreign memory
// That's why the last stride is calculated outside of the for-loop, with a special extract shuffle mask involved
//
// Extra mask & separate stride:7 calculations can be eliminated by simply providing rgb pixel span of slightly bigger size than pixels data need:
// Total pixel data size is 192 bytes, avx registers need it to be 200 bytes
const int bytesPerRgbStride = 24;
for (int i = 0; i < 8; i++)
{
@ -135,91 +133,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
#endif
}
/// <summary>
/// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:2:0 subsampling
/// </summary>
/// <remarks>
/// Total size of rgb span must be at least 200 bytes: 8 strides of 24 bytes are read with
/// 32-byte loads, so the last load starts at byte offset 168 and ends at byte offset 200.
/// </remarks>
/// <param name="rgbSpan">Source pixels, read as 8 consecutive strides of 24 bytes each.</param>
/// <param name="yBlock">Destination block that receives one luminance row per source stride.</param>
/// <param name="cbBlock">Destination block that receives the Cb values at the position selected by <paramref name="idx"/>.</param>
/// <param name="crBlock">Destination block that receives the Cr values at the position selected by <paramref name="idx"/>.</param>
/// <param name="idx">Selects the write position inside <paramref name="cbBlock"/> and <paramref name="crBlock"/> (see the destOffset computation).</param>
public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock, int idx)
{
Debug.Assert(IsSupported, "AVX2 is required to run this converter");
#if SUPPORTS_RUNTIME_INTRINSICS
// Broadcast conversion coefficients; they match the formulas quoted in the comments further down.
var f0299 = Vector256.Create(0.299f);
var f0587 = Vector256.Create(0.587f);
var f0114 = Vector256.Create(0.114f);
var fn0168736 = Vector256.Create(-0.168736f);
var fn0331264 = Vector256.Create(-0.331264f);
var f128 = Vector256.Create(128f);
var fn0418688 = Vector256.Create(-0.418688f);
var fn0081312F = Vector256.Create(-0.081312F);
var f05 = Vector256.Create(0.5f);
var zero = Vector256.Create(0).AsByte();
// Reinterpret the start of the pixel span as a 256-bit vector so strides can be loaded by byte offset.
ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
ref Vector256<float> destYRef = ref yBlock.V0;
// Offset into the chroma blocks, measured in Vector128<float> elements.
// For idx = 0, 1, 2, 3 this evaluates to 0, 1, 8, 9.
// NOTE(review): presumably these are the four 4x4 quadrants of the 8x8 chroma block - confirm against Block8x8F layout.
int destOffset = (idx & 2) * 4 + (idx & 1);
ref Vector128<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref cbBlock), destOffset);
ref Vector128<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref crBlock), destOffset);
// Shuffle masks are read from byte tables declared elsewhere in this class.
// NOTE(review): judging by their names they spread the 24 stride bytes over both 128-bit lanes and then group the channels - verify against the table contents.
var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
Vector256<byte> rgb, rg, bx;
Vector256<float> r, g, b;
// Scratch storage keeping the r/g/b rows of the current group of 4 strides for the chroma calculation.
Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];
const int bytesPerRgbStride = 24;
// Two passes, each handling 4 of the 8 strides.
for (int i = 0; i < 2; i++)
{
// each 4 lanes - [0, 1, 2, 3] & [4, 5, 6, 7]
for (int j = 0; j < 4; j++)
{
// Load 32 bytes starting at stride (i * 4 + j) and rearrange them with the two masks.
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
rgb = Avx2.Shuffle(rgb, extractRgbMask);
// Interleave with zero bytes twice to widen the 8-bit values to 32-bit integers, then convert to float.
rg = Avx2.UnpackLow(rgb, zero);
bx = Avx2.UnpackHigh(rgb, zero);
r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
// (0.299F * r) + (0.587F * g) + (0.114F * b);
Unsafe.Add(ref destYRef, i * 4 + j) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
// Keep the full-resolution channels; chroma is computed after the 4 strides are collected.
rDataLanes[j] = r;
gDataLanes[j] = g;
bDataLanes[j] = b;
}
// Pass 0 writes at offsets +0 and +2, pass 1 at offsets +4 and +6 (in Vector128<float> units).
int localDestOffset = (i & 1) * 4;
// NOTE(review): Scale_8x4_4x2 is declared elsewhere; presumably it reduces the 4 collected rows of 8 values to 2 rows of 4 values packed in one vector - confirm.
r = Scale_8x4_4x2(rDataLanes);
g = Scale_8x4_4x2(gDataLanes);
b = Scale_8x4_4x2(bDataLanes);
// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
Vector256<float> cb = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
// The lower and upper 128-bit halves are stored 2 Vector128<float> elements apart.
Unsafe.Add(ref destCbRef, localDestOffset) = cb.GetLower();
Unsafe.Add(ref destCbRef, localDestOffset + 2) = cb.GetUpper();
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
Vector256<float> cr = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
Unsafe.Add(ref destCrRef, localDestOffset) = cr.GetLower();
Unsafe.Add(ref destCrRef, localDestOffset + 2) = cr.GetUpper();
}
#endif
}
/// <summary>
/// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling
/// </summary>
public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
{
Debug.Assert(IsSupported, "AVX2 is required to run this converter");
@ -337,36 +254,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
/// <summary>
/// Within each 128-bit lane, adds element 0 to element 2 and element 1 to element 3
/// of both inputs. The two sums taken from <paramref name="v0"/> land in the lower
/// half of the lane, the two sums taken from <paramref name="v1"/> in the upper half.
/// </summary>
/// <param name="v0">Vector supplying the first two results of every lane.</param>
/// <param name="v1">Vector supplying the last two results of every lane.</param>
/// <returns>The vector of pairwise sums.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> SumVerticalPairs(Vector256<float> v0, Vector256<float> v1)
{
    // Selects elements 0 and 1 of each lane from both inputs.
    const byte firstPairControl = 0b01_00_01_00;

    // Selects elements 2 and 3 of each lane from both inputs.
    const byte secondPairControl = 0b11_10_11_10;

    Vector256<float> firstPairs = Avx.Shuffle(v0, v1, firstPairControl);
    Vector256<float> secondPairs = Avx.Shuffle(v0, v1, secondPairControl);

    return Avx.Add(firstPairs, secondPairs);
}
/// <summary>
/// Computes the Cb and Cr blocks from separate red, green and blue blocks,
/// processing one 8-float row per iteration.
/// </summary>
/// <param name="rBlock">Red channel values.</param>
/// <param name="gBlock">Green channel values.</param>
/// <param name="bBlock">Blue channel values.</param>
/// <param name="cbBlock">Receives the Cb values.</param>
/// <param name="crBlock">Receives the Cr values.</param>
public static void ConvertCbCr(ref Block8x8F rBlock, ref Block8x8F gBlock, ref Block8x8F bBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
{
    // Broadcast coefficients shared by both formulas.
    Vector256<float> chromaBias = Vector256.Create(128f);
    Vector256<float> half = Vector256.Create(0.5f);

    // Cb weights for red and green.
    Vector256<float> cbRedWeight = Vector256.Create(-0.168736f);
    Vector256<float> cbGreenWeight = Vector256.Create(-0.331264f);

    // Cr weights for green and blue.
    Vector256<float> crGreenWeight = Vector256.Create(-0.418688f);
    Vector256<float> crBlueWeight = Vector256.Create(-0.081312F);

    ref Vector256<float> redRows = ref rBlock.V0;
    ref Vector256<float> greenRows = ref gBlock.V0;
    ref Vector256<float> blueRows = ref bBlock.V0;
    ref Vector256<float> cbRows = ref cbBlock.V0;
    ref Vector256<float> crRows = ref crBlock.V0;

    int row = 0;
    while (row < 8)
    {
        // Kept as refs (not copies) so the sources are re-read for the Cr formula,
        // exactly as they would be if a destination block aliased a source block.
        ref Vector256<float> red = ref Unsafe.Add(ref redRows, row);
        ref Vector256<float> green = ref Unsafe.Add(ref greenRows, row);
        ref Vector256<float> blue = ref Unsafe.Add(ref blueRows, row);

        // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
        Vector256<float> cbSum = Avx.Multiply(half, blue);
        cbSum = SimdUtils.HwIntrinsics.MultiplyAdd(cbSum, cbGreenWeight, green);
        cbSum = SimdUtils.HwIntrinsics.MultiplyAdd(cbSum, cbRedWeight, red);
        Unsafe.Add(ref cbRows, row) = Avx.Add(chromaBias, cbSum);

        // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
        Vector256<float> crSum = Avx.Multiply(crBlueWeight, blue);
        crSum = SimdUtils.HwIntrinsics.MultiplyAdd(crSum, crGreenWeight, green);
        crSum = SimdUtils.HwIntrinsics.MultiplyAdd(crSum, half, red);
        Unsafe.Add(ref crRows, row) = Avx.Add(chromaBias, crSum);

        row++;
    }
}
#endif
}
}

4
src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs

@ -77,7 +77,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
// temporary pixel buffers
this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan());
this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan());
// frame data
this.samplingAreaSize = new Size(frame.Width, frame.Height);
@ -102,7 +102,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
if (RgbToYCbCrConverterVectorized.IsSupported)
{
RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
}
else
{

2
src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs

@ -71,7 +71,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
// temporary pixel buffers
this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan());
this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan());
// frame data
this.samplingAreaSize = new Size(frame.Width, frame.Height);

30
tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs

@ -92,8 +92,8 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
var cb = default(Block8x8F);
var cr = default(Block8x8F);
RgbToYCbCrConverterVectorized.Convert420_16x8(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0);
RgbToYCbCrConverterVectorized.Convert420_16x8(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1);
RgbToYCbCrConverterVectorized.Convert420(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0);
RgbToYCbCrConverterVectorized.Convert420(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1);
Verify420(data, yBlocks, ref cb, ref cr, new ApproximateFloatComparer(1F));
}
@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
ref Block8x8F crResult,
ApproximateFloatComparer comparer)
{
var tempBlock = default(Block8x8F);
var trueBlock = default(Block8x8F);
var cbTrue = new Block8x8F[4];
var crTrue = new Block8x8F[4];
@ -133,31 +133,31 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
// top left
Copy8x8(data, tempData);
RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[0], ref crTrue[0]);
VerifyBlock(ref yResult[0], ref tempBlock, comparer);
RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[0], ref crTrue[0]);
VerifyBlock(ref yResult[0], ref trueBlock, comparer);
// top right
Copy8x8(data.Slice(8), tempData);
RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[1], ref crTrue[1]);
VerifyBlock(ref yResult[1], ref tempBlock, comparer);
RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[1], ref crTrue[1]);
VerifyBlock(ref yResult[1], ref trueBlock, comparer);
// bottom left
Copy8x8(data.Slice(8 * 16), tempData);
RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[2], ref crTrue[2]);
VerifyBlock(ref yResult[2], ref tempBlock, comparer);
RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[2], ref crTrue[2]);
VerifyBlock(ref yResult[2], ref trueBlock, comparer);
// bottom right
Copy8x8(data.Slice((8 * 16) + 8), tempData);
RgbToYCbCr(tempData, ref tempBlock, ref cbTrue[3], ref crTrue[3]);
VerifyBlock(ref yResult[3], ref tempBlock, comparer);
RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[3], ref crTrue[3]);
VerifyBlock(ref yResult[3], ref trueBlock, comparer);
// verify Cb
Scale16X16To8X8(ref tempBlock, cbTrue);
VerifyBlock(ref cbResult, ref tempBlock, comparer);
Scale16X16To8X8(ref trueBlock, cbTrue);
VerifyBlock(ref cbResult, ref trueBlock, comparer);
// verify Cr
Scale16X16To8X8(ref tempBlock, crTrue);
VerifyBlock(ref crResult, ref tempBlock, comparer);
Scale16X16To8X8(ref trueBlock, crTrue);
VerifyBlock(ref crResult, ref trueBlock, comparer);
// extracts 8x8 blocks from 16x8 memory region

Loading…
Cancel
Save