Browse Source

[WIP] Implemented 16x8 420 subsampling conversion

pull/1632/head
Dmitry Pentin 5 years ago
parent
commit
d50e255c85
  1. 19
      src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
  2. 86
      src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
  3. 24
      src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs

19
src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs

@ -123,8 +123,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel<TPixel>
{
Span<Block8x8F> temporalBlocks = stackalloc Block8x8F[2];
var unzig = ZigZag.CreateUnzigTable();
var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
@ -140,18 +138,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
cancellationToken.ThrowIfCancellationRequested();
for (int x = 0; x < pixels.Width; x += 16)
{
for (int i = 0; i < 4; i++)
for(int i = 0; i < 2; i++)
{
int xOff = (i & 1) * 8;
int yOff = (i & 2) * 4;
int yOff = i * 8;
currentRows.Update(pixelBuffer, y + yOff);
pixelConverter.Convert420(frame, x + xOff, y + yOff, ref currentRows, i);
pixelConverter.Convert420(frame, x, y, ref currentRows, i);
prevDCY = this.WriteBlock(
QuantIndex.Luminance,
prevDCY,
ref pixelConverter.twinBlocksY[0],
ref luminanceQuantTable,
ref unzig);
prevDCY = this.WriteBlock(
QuantIndex.Luminance,
prevDCY,
ref pixelConverter.Y,
ref pixelConverter.twinBlocksY[1],
ref luminanceQuantTable,
ref unzig);
}

86
src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs

@ -204,14 +204,96 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
#endif
}
/// <summary>
/// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling
/// </summary>
/// <remarks>
/// The input is consumed as 16 strides of 24 bytes (8 pixels) each, but every stride is
/// loaded as a full 32-byte vector. The last load therefore touches bytes 360..391, so the
/// memory behind <paramref name="rgbSpan"/> must be at least 392 bytes long (384 + 8 padding).
/// When SUPPORTS_RUNTIME_INTRINSICS is not defined the method body is compiled out and
/// nothing is written.
/// </remarks>
/// <param name="rgbSpan">Source pixels; presumably 8 rows of 16 pixels stored row by row (see remarks for the required padding).</param>
/// <param name="yBlocks">Destination luminance blocks; elements 0 and 1 are both written, so the span needs at least two elements.</param>
/// <param name="cbBlock">Destination Cb block; four 8-float vectors are written starting at vector index <c>row * 4</c>.</param>
/// <param name="crBlock">Destination Cr block; written at the same positions as <paramref name="cbBlock"/>.</param>
/// <param name="row">Index of the 16x8 strip being converted; selects which group of four chroma vectors is written.</param>
public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, Span<Block8x8F> yBlocks, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
{
Debug.Assert(IsSupported, "AVX2 is required to run this converter");
#if SUPPORTS_RUNTIME_INTRINSICS
// Conversion coefficients broadcast to all 8 float lanes.
var f0299 = Vector256.Create(0.299f);
var f0587 = Vector256.Create(0.587f);
var f0114 = Vector256.Create(0.114f);
var fn0168736 = Vector256.Create(-0.168736f);
var fn0331264 = Vector256.Create(-0.331264f);
var f128 = Vector256.Create(128f);
var fn0418688 = Vector256.Create(-0.418688f);
var fn0081312F = Vector256.Create(-0.081312F);
var f05 = Vector256.Create(0.5f);
// All-zero vector, interleaved with pixel bytes below to zero-extend them.
var zero = Vector256.Create(0).AsByte();
ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
// Chroma destination offset, counted in Vector256<float> units (8 floats each).
int destOffset = row * 4;
ref Vector256<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock), destOffset);
ref Vector256<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock), destOffset);
// Shuffle masks defined elsewhere in this class.
// NOTE(review): judging by their names they spread the 24 payload bytes over both
// 128-bit lanes and then group the R, G and B bytes together - confirm at the definitions.
var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
Vector256<byte> rgb, rg, bx;
Vector256<float> r, g, b;
// Per-channel scratch: the four 8-pixel strides of the current iteration, kept for chroma downscaling.
Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];
const int bytesPerRgbStride = 24;
// Each outer iteration consumes four strides (32 pixels) and emits one Cb and one Cr vector.
for (int i = 0; i < 4; i++)
{
// 16x2 => 8x1
for (int j = 0; j < 4; j++)
{
// Unaligned 32-byte load at a 24-byte stride; only the first 24 bytes belong to this stride.
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
rgb = Avx2.Shuffle(rgb, extractRgbMask);
// Interleaving with zero twice widens each byte to a 32-bit integer before the float conversion.
rg = Avx2.UnpackLow(rgb, zero);
bx = Avx2.UnpackHigh(rgb, zero);
r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
// Bit 1 of j picks the first or second row of the current pair;
// bit 0 of j (used below) picks which of the two Y blocks receives the stride.
int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1);
// (0.299F * r) + (0.587F * g) + (0.114F * b);
Unsafe.Add(ref yBlocks[j & 1].V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
rDataLanes[j] = r;
gDataLanes[j] = g;
bDataLanes[j] = b;
}
// Reduce the four stored strides of each channel to a single 8-float vector.
r = Scale_8x4_4x2(rDataLanes);
g = Scale_8x4_4x2(gDataLanes);
b = Scale_8x4_4x2(bDataLanes);
// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
}
#endif
}
/// <summary>
/// Reduces four 8-float vectors to a single 8-float vector by summing neighbouring
/// values and multiplying the sums by 0.25.
/// </summary>
/// <remarks>
/// The vectors are combined as (v[0], v[2]) and (v[1], v[3]). This matches the order in
/// which <c>Convert420_16x8</c> fills its scratch lanes: indices 0 and 1 hold the two
/// 8-pixel halves of the upper row, indices 2 and 3 the two halves of the lower row.
/// NOTE(review): the exact lane layout of the result depends on SumHorizontalPairs,
/// SumVerticalPairs and the PermuteMaskSwitchInnerDWords8x32 mask, which are defined
/// elsewhere - confirm against those definitions.
/// </remarks>
/// <param name="v">Input vectors; exactly the first four elements are read.</param>
/// <returns>The scaled values packed into one 256-bit vector.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> Scale_8x4_4x2(Span<Vector256<float>> v)
{
    Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
    var f025 = Vector256.Create(0.25f);

    // Each local is declared exactly once; a second declaration of the same name in
    // this scope would not compile.
    Vector256<float> topPairSum = SumHorizontalPairs(v[0], v[2]);
    Vector256<float> botPairSum = SumHorizontalPairs(v[1], v[3]);

    // Sum of four values * 0.25 gives their average; the final permute reorders the result.
    return Avx2.PermuteVar8x32(Avx.Multiply(SumVerticalPairs(topPairSum, botPairSum), f025), switchInnerDoubleWords);
}

24
src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs

@ -46,14 +46,20 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
/// </summary>
private Span<Rgb24> rgbSpan;
public Span<Block8x8F> twinBlocksY;
public static YCbCrForwardConverter<TPixel> Create()
{
var result = default(YCbCrForwardConverter<TPixel>);
// creating rgb pixel buffer
// TODO: this is subject to discuss
result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[200].AsSpan());
result.pixelSpan = new TPixel[64].AsSpan();
const int twoBlocksByteSizeWithPadding = 384 + 8; // converter.Convert comments for +8 padding
result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[twoBlocksByteSizeWithPadding].AsSpan());
// TODO: this size should be configurable
result.pixelSpan = new TPixel[128].AsSpan();
result.twinBlocksY = new Block8x8F[2].AsSpan();
// Avoid creating lookup tables, when vectorized converter is supported
if (!RgbToYCbCrConverterVectorized.IsSupported)
@ -70,7 +76,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows)
{
Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
LoadAndStretchEdges(currentRows, this.pixelSpan, x, buffer.Width - x, buffer.Height - y);
LoadAndStretchEdges(currentRows, this.pixelSpan, x, buffer.Width - x, buffer.Height - y, new Size(8));
PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
@ -94,13 +100,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
public void Convert420(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows, int idx)
{
Memory.Buffer2D<TPixel> buffer = frame.PixelBuffer;
LoadAndStretchEdges(currentRows, this.pixelSpan, x, Math.Min(8, buffer.Width - x), Math.Min(8, buffer.Height - y));
LoadAndStretchEdges(currentRows, this.pixelSpan, x, Math.Min(16, buffer.Width - x), Math.Min(8, buffer.Height - y), new Size(16, 8));
PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelSpan, this.rgbSpan);
if (RgbToYCbCrConverterVectorized.IsSupported)
{
RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.Y, ref this.Cb, ref this.Cr, idx);
RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, this.twinBlocksY, ref this.Cb, ref this.Cr, idx);
}
else
{
@ -110,7 +116,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
}
// TODO: add DebugGuard checks?
private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int width, int height)
private static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, int startX, int width, int height, Size areaSize)
{
//Guard.MustBeBetweenOrEqualTo(width, 1, 8, nameof(width));
//Guard.MustBeBetweenOrEqualTo(height, 1, 8, nameof(width));
@ -122,10 +128,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
}
uint byteWidth = (uint)(width * Unsafe.SizeOf<TPixel>());
int remainderXCount = 8 - width;
int remainderXCount = areaSize.Width - width;
ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast<TPixel, byte>(dest));
int rowSizeInBytes = 8 * Unsafe.SizeOf<TPixel>();
int rowSizeInBytes = areaSize.Width * Unsafe.SizeOf<TPixel>();
for (int y = 0; y < height; y++)
{
@ -144,7 +150,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
}
}
int remainderYCount = 8 - height;
int remainderYCount = areaSize.Height - height;
if (remainderYCount == 0)
{

Loading…
Cancel
Save