Browse Source

Merge branch 'convert420-performance' into jpeg-decode-encode-optimization

pull/1632/head
Dmitry Pentin 5 years ago
parent
commit
25437ad003
  1. 37
      src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
  2. 114
      src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
  3. 298
      src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
  4. 113
      src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
  5. 135
      src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
  6. 90
      src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
  7. 9
      tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
  8. 2
      tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
  9. 2
      tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs

37
src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs

@ -71,11 +71,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
// ReSharper disable once InconsistentNaming
int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
RowOctet<TPixel> currentRows = default;
var pixelConverter = new YCbCrForwardConverter444<TPixel>(frame);
for (int y = 0; y < pixels.Height; y += 8)
{
cancellationToken.ThrowIfCancellationRequested();
@ -83,7 +84,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
for (int x = 0; x < pixels.Width; x += 8)
{
pixelConverter.Convert(frame, x, y, ref currentRows);
pixelConverter.Convert(x, y, ref currentRows);
prevDCY = this.WriteBlock(
QuantIndex.Luminance,
@ -123,57 +124,53 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel<TPixel>
{
Block8x8F b = default;
Span<Block8x8F> cb = stackalloc Block8x8F[4];
Span<Block8x8F> cr = stackalloc Block8x8F[4];
var unzig = ZigZag.CreateUnzigTable();
var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
// ReSharper disable once InconsistentNaming
int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
RowOctet<TPixel> currentRows = default;
var pixelConverter = new YCbCrForwardConverter420<TPixel>(frame);
for (int y = 0; y < pixels.Height; y += 16)
{
cancellationToken.ThrowIfCancellationRequested();
for (int x = 0; x < pixels.Width; x += 16)
{
for (int i = 0; i < 4; i++)
for (int i = 0; i < 2; i++)
{
int xOff = (i & 1) * 8;
int yOff = (i & 2) * 4;
int yOff = i * 8;
currentRows.Update(pixelBuffer, y + yOff);
pixelConverter.Convert(frame, x + xOff, y + yOff, ref currentRows);
pixelConverter.Convert(x, y, ref currentRows, i);
cb[i] = pixelConverter.Cb;
cr[i] = pixelConverter.Cr;
prevDCY = this.WriteBlock(
QuantIndex.Luminance,
prevDCY,
ref pixelConverter.YLeft,
ref luminanceQuantTable,
ref unzig);
prevDCY = this.WriteBlock(
QuantIndex.Luminance,
prevDCY,
ref pixelConverter.Y,
ref pixelConverter.YRight,
ref luminanceQuantTable,
ref unzig);
}
Block8x8F.Scale16X16To8X8(ref b, cb);
prevDCCb = this.WriteBlock(
QuantIndex.Chrominance,
prevDCCb,
ref b,
ref pixelConverter.Cb,
ref chrominanceQuantTable,
ref unzig);
Block8x8F.Scale16X16To8X8(ref b, cr);
prevDCCr = this.WriteBlock(
QuantIndex.Chrominance,
prevDCCr,
ref b,
ref pixelConverter.Cr,
ref chrominanceQuantTable,
ref unzig);
}

114
src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs

@ -115,7 +115,41 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
}
public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ConvertPixelInto(
int r,
int g,
int b,
ref Block8x8F yResult,
int i)
{
// float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
yResult[i] = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ConvertPixelInto(int r, int g, int b, ref float yResult) =>
// float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
yResult = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ConvertPixelInto(int r, int g, int b, ref float cbResult, ref float crResult)
{
// float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
cbResult = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
// float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
crResult = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
}
/// <summary>
/// Converts Rgb24 pixels into YCbCr color space with 4:4:4 subsampling sampling of luminance and chroma.
/// </summary>
/// <param name="rgbSpan">Span of Rgb24 pixel data</param>
/// <param name="yBlock">Resulting Y values block</param>
/// <param name="cbBlock">Resulting Cb values block</param>
/// <param name="crBlock">Resulting Cr values block</param>
public void Convert444(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
{
ref Rgb24 rgbStart = ref rgbSpan[0];
@ -134,6 +168,84 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
}
}
/// <summary>
/// Converts Rgb24 pixels into YCbCr color space with 4:2:0 subsampling of luminance and chroma.
/// </summary>
/// <remarks>Calculates 2 out of 4 luminance blocks and half of chroma blocks. This method must be called twice per 4x 8x8 DCT blocks with different row param.</remarks>
/// <param name="rgbSpan">Span of Rgb24 pixel data</param>
/// <param name="yBlockLeft">First or "left" resulting Y block</param>
/// <param name="yBlockRight">Second or "right" resulting Y block</param>
/// <param name="cbBlock">Resulting Cb values block</param>
/// <param name="crBlock">Resulting Cr values block</param>
/// <param name="row">Row index of the 16x16 block, 0 or 1</param>
public void Convert420(Span<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
{
DebugGuard.MustBeBetweenOrEqualTo(row, 0, 1, nameof(row));
ref float yBlockLeftRef = ref Unsafe.As<Block8x8F, float>(ref yBlockLeft);
ref float yBlockRightRef = ref Unsafe.As<Block8x8F, float>(ref yBlockRight);
// 0-31 or 32-63
// upper or lower part
int chromaWriteOffset = row * Block8x8F.Size / 2;
ref float cbBlockRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, float>(ref cbBlock), chromaWriteOffset);
ref float crBlockRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, float>(ref crBlock), chromaWriteOffset);
ref Rgb24 rgbStart = ref rgbSpan[0];
for (int i = 0; i < 8; i += 2)
{
// 8 pixels by 3 integers
Span<int> rgbTriplets = stackalloc int[24];
for (int j = 0; j < 2; j++)
{
int yBlockWriteOffset = (i + j) * 8;
ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, (i + j) * 16);
// left
this.ConvertChunk420(ref stride, ref Unsafe.Add(ref yBlockLeftRef, yBlockWriteOffset), rgbTriplets);
// right
this.ConvertChunk420(ref Unsafe.Add(ref stride, 8), ref Unsafe.Add(ref yBlockRightRef, yBlockWriteOffset), rgbTriplets.Slice(12));
}
int writeIdx = 8 * (i / 2);
ref float cbWriteRef = ref Unsafe.Add(ref cbBlockRef, writeIdx);
ref float crWriteRef = ref Unsafe.Add(ref crBlockRef, writeIdx);
for (int j = 0; j < 8; j++)
{
int idx = j * 3;
this.ConvertPixelInto(
rgbTriplets[idx] / 4, // r
rgbTriplets[idx + 1] / 4, // g
rgbTriplets[idx + 2] / 4, // b
ref Unsafe.Add(ref cbWriteRef, j),
ref Unsafe.Add(ref crWriteRef, j));
}
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ConvertChunk420(ref Rgb24 stride, ref float yBlock, Span<int> chromaRgbTriplet)
{
for (int k = 0; k < 8; k += 2)
{
ref float yBlockRef = ref Unsafe.Add(ref yBlock, k);
Rgb24 px0 = Unsafe.Add(ref stride, k);
Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
this.ConvertPixelInto(px0.R, px0.G, px0.B, ref yBlockRef);
this.ConvertPixelInto(px1.R, px1.G, px1.B, ref Unsafe.Add(ref yBlockRef, 1));
int idx = 3 * (k / 2);
chromaRgbTriplet[idx] += px0.R + px1.R;
chromaRgbTriplet[idx + 1] += px0.G + px1.G;
chromaRgbTriplet[idx + 2] += px0.B + px1.B;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Fix(float x)
=> (int)((x * (1L << ScaleBits)) + 0.5F);

298
src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs

@ -27,19 +27,27 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
}
}
public static int AvxRegisterRgbCompatibilityPadding
{
get
{
if (IsSupported)
{
return 8;
}
return 0;
}
}
#if SUPPORTS_RUNTIME_INTRINSICS
private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[]
{
0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0
};
private static ReadOnlySpan<byte> MoveLast24BytesToSeparateLanes => new byte[]
{
2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0
};
private static ReadOnlySpan<byte> ExtractRgb => new byte[]
{
0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF,
@ -47,6 +55,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
};
#endif
/// <summary>
/// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:4:4 subsampling
/// </summary>
/// <remarks>Total size of rgb span must be 200 bytes</remarks>
/// <param name="rgbSpan">Span of rgb pixels with size of 64</param>
/// <param name="yBlock">8x8 destination matrix of Luminance(Y) converted data</param>
/// <param name="cbBlock">8x8 destination matrix of Chrominance(Cb) converted data</param>
/// <param name="crBlock">8x8 destination matrix of Chrominance(Cr) converted data</param>
public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
{
Debug.Assert(IsSupported, "AVX2 is required to run this converter");
@ -63,7 +79,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
var f05 = Vector256.Create(0.5f);
var zero = Vector256.Create(0).AsByte();
ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
ref Vector256<float> destYRef = ref yBlock.V0;
ref Vector256<float> destCbRef = ref cbBlock.V0;
ref Vector256<float> destCrRef = ref crBlock.V0;
@ -72,9 +88,31 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
Vector256<byte> rgb, rg, bx;
Vector256<float> r, g, b;
for (int i = 0; i < 7; i++)
// TODO: probably remove this after the draft
// rgbByteSpan contains 8 strides by 8 pixels each, thus 64 pixels total
// Strides are stored sequentially - one big span of 64 * 3 = 192 bytes
// Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits
// Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride:
// stride 0 0 - 192 -(+64bits)-> 256
// stride 1 192 - 384 -(+64bits)-> 448
// stride 2 384 - 576 -(+64bits)-> 640
// stride 3 576 - 768 -(+64bits)-> 832
// stride 4 768 - 960 -(+64bits)-> 1024
// stride 5 960 - 1152 -(+64bits)-> 1216
// stride 6 1152 - 1344 -(+64bits)-> 1408
// stride 7 1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION
//
// Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits
// This is not permitted - we are reading foreign memory
// That's why last stride is calculated outside of the for-loop loop with special extract shuffle mask involved
//
// Extra mask & separate stride:7 calculations can be eliminated by simply providing rgb pixel span of slightly bigger size than pixels data need:
// Total pixel data size is 192 bytes, avx registers need it to be 200 bytes
const int bytesPerRgbStride = 24;
for (int i = 0; i < 8; i++)
{
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte();
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * i)).AsUInt32(), extractToLanesMask).AsByte();
rgb = Avx2.Shuffle(rgb, extractRgbMask);
@ -94,27 +132,241 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
}
#endif
}
/// <summary>
/// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:2:0 subsampling
/// </summary>
/// <remarks>Total size of rgb span must be 200 bytes</remarks>
public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock, int idx)
{
Debug.Assert(IsSupported, "AVX2 is required to run this converter");
#if SUPPORTS_RUNTIME_INTRINSICS
var f0299 = Vector256.Create(0.299f);
var f0587 = Vector256.Create(0.587f);
var f0114 = Vector256.Create(0.114f);
var fn0168736 = Vector256.Create(-0.168736f);
var fn0331264 = Vector256.Create(-0.331264f);
var f128 = Vector256.Create(128f);
var fn0418688 = Vector256.Create(-0.418688f);
var fn0081312F = Vector256.Create(-0.081312F);
var f05 = Vector256.Create(0.5f);
var zero = Vector256.Create(0).AsByte();
ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
ref Vector256<float> destYRef = ref yBlock.V0;
int destOffset = (idx & 2) * 4 + (idx & 1);
extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
rgb = Avx2.Shuffle(rgb, extractRgbMask);
ref Vector128<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref cbBlock), destOffset);
ref Vector128<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector128<float>>(ref crBlock), destOffset);
rg = Avx2.UnpackLow(rgb, zero);
bx = Avx2.UnpackHigh(rgb, zero);
var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
Vector256<byte> rgb, rg, bx;
Vector256<float> r, g, b;
Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];
const int bytesPerRgbStride = 24;
for (int i = 0; i < 2; i++)
{
// each 4 lanes - [0, 1, 2, 3] & [4, 5, 6, 7]
for (int j = 0; j < 4; j++)
{
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
rgb = Avx2.Shuffle(rgb, extractRgbMask);
r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
rg = Avx2.UnpackLow(rgb, zero);
bx = Avx2.UnpackHigh(rgb, zero);
// (0.299F * r) + (0.587F * g) + (0.114F * b);
Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
// (0.299F * r) + (0.587F * g) + (0.114F * b);
Unsafe.Add(ref destYRef, i * 4 + j) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
rDataLanes[j] = r;
gDataLanes[j] = g;
bDataLanes[j] = b;
}
int localDestOffset = (i & 1) * 4;
r = Scale_8x4_4x2(rDataLanes);
g = Scale_8x4_4x2(gDataLanes);
b = Scale_8x4_4x2(bDataLanes);
// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
Vector256<float> cb = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
Unsafe.Add(ref destCbRef, localDestOffset) = cb.GetLower();
Unsafe.Add(ref destCbRef, localDestOffset + 2) = cb.GetUpper();
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
Vector256<float> cr = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
Unsafe.Add(ref destCrRef, localDestOffset) = cr.GetLower();
Unsafe.Add(ref destCrRef, localDestOffset + 2) = cr.GetUpper();
}
#endif
}
/// <summary>
/// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling
/// </summary>
public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
{
Debug.Assert(IsSupported, "AVX2 is required to run this converter");
#if SUPPORTS_RUNTIME_INTRINSICS
var f0299 = Vector256.Create(0.299f);
var f0587 = Vector256.Create(0.587f);
var f0114 = Vector256.Create(0.114f);
var fn0168736 = Vector256.Create(-0.168736f);
var fn0331264 = Vector256.Create(-0.331264f);
var f128 = Vector256.Create(128f);
var fn0418688 = Vector256.Create(-0.418688f);
var fn0081312F = Vector256.Create(-0.081312F);
var f05 = Vector256.Create(0.5f);
var zero = Vector256.Create(0).AsByte();
ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
int destOffset = row * 4;
ref Vector256<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock), destOffset);
ref Vector256<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock), destOffset);
var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
Vector256<byte> rgb, rg, bx;
Vector256<float> r, g, b;
Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];
const int bytesPerRgbStride = 24;
for (int i = 0; i < 4; i++)
{
// 16x2 => 8x1
// left 8x8 column conversions
for (int j = 0; j < 4; j += 2)
{
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
rgb = Avx2.Shuffle(rgb, extractRgbMask);
rg = Avx2.UnpackLow(rgb, zero);
bx = Avx2.UnpackHigh(rgb, zero);
r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1);
// (0.299F * r) + (0.587F * g) + (0.114F * b);
Unsafe.Add(ref yBlockLeft.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
rDataLanes[j] = r;
gDataLanes[j] = g;
bDataLanes[j] = b;
}
// 16x2 => 8x1
// right 8x8 column conversions
for (int j = 1; j < 4; j += 2)
{
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
rgb = Avx2.Shuffle(rgb, extractRgbMask);
rg = Avx2.UnpackLow(rgb, zero);
bx = Avx2.UnpackHigh(rgb, zero);
r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1);
// (0.299F * r) + (0.587F * g) + (0.114F * b);
Unsafe.Add(ref yBlockRight.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
rDataLanes[j] = r;
gDataLanes[j] = g;
bDataLanes[j] = b;
}
r = Scale_8x4_4x2(rDataLanes);
g = Scale_8x4_4x2(gDataLanes);
b = Scale_8x4_4x2(bDataLanes);
// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
}
#endif
}
#if SUPPORTS_RUNTIME_INTRINSICS
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> Scale_8x4_4x2(Span<Vector256<float>> v)
{
Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
var f025 = Vector256.Create(0.25f);
Vector256<float> topPairSum = SumHorizontalPairs(v[0], v[2]);
Vector256<float> botPairSum = SumHorizontalPairs(v[1], v[3]);
return Avx2.PermuteVar8x32(Avx.Multiply(SumVerticalPairs(topPairSum, botPairSum), f025), switchInnerDoubleWords);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> SumHorizontalPairs(Vector256<float> v0, Vector256<float> v1)
=> Avx.Add(Avx.Shuffle(v0, v1, 0b10_00_10_00), Avx.Shuffle(v0, v1, 0b11_01_11_01));
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> SumVerticalPairs(Vector256<float> v0, Vector256<float> v1)
=> Avx.Add(Avx.Shuffle(v0, v1, 0b01_00_01_00), Avx.Shuffle(v0, v1, 0b11_10_11_10));
public static void ConvertCbCr(ref Block8x8F rBlock, ref Block8x8F gBlock, ref Block8x8F bBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
{
var fn0168736 = Vector256.Create(-0.168736f);
var fn0331264 = Vector256.Create(-0.331264f);
var f128 = Vector256.Create(128f);
var fn0418688 = Vector256.Create(-0.418688f);
var fn0081312F = Vector256.Create(-0.081312F);
var f05 = Vector256.Create(0.5f);
ref Vector256<float> destCbRef = ref cbBlock.V0;
ref Vector256<float> destCrRef = ref crBlock.V0;
ref Vector256<float> rRef = ref rBlock.V0;
ref Vector256<float> gRef = ref gBlock.V0;
ref Vector256<float> bRef = ref bBlock.V0;
for (int i = 0; i < 8; i++)
{
ref Vector256<float> r = ref Unsafe.Add(ref rRef, i);
ref Vector256<float> g = ref Unsafe.Add(ref gRef, i);
ref Vector256<float> b = ref Unsafe.Add(ref bRef, i);
// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
}
}
#endif
}
}

113
src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs

@ -0,0 +1,113 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
/// <summary>
/// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks.
/// </summary>
/// <typeparam name="TPixel">The pixel type to work on</typeparam>
internal ref struct YCbCrForwardConverter420<TPixel>
where TPixel : unmanaged, IPixel<TPixel>
{
// TODO: docs
private const int PixelsPerSample = 16 * 8;
// TODO: docs
private static int RgbSpanByteSize = PixelsPerSample * 3;
// TODO: docs
private static readonly Size SampleSize = new Size(16, 8);
/// <summary>
/// The left Y component
/// </summary>
public Block8x8F YLeft;
/// <summary>
/// The left Y component
/// </summary>
public Block8x8F YRight;
/// <summary>
/// The Cb component
/// </summary>
public Block8x8F Cb;
/// <summary>
/// The Cr component
/// </summary>
public Block8x8F Cr;
/// <summary>
/// The color conversion tables
/// </summary>
private RgbToYCbCrConverterLut colorTables;
/// <summary>
/// Temporal 16x8 block to hold TPixel data
/// </summary>
private Span<TPixel> pixelSpan;
/// <summary>
/// Temporal RGB block
/// </summary>
private Span<Rgb24> rgbSpan;
// TODO: docs
private Size samplingAreaSize;
// TODO: docs
private Configuration config;
public YCbCrForwardConverter420(ImageFrame<TPixel> frame)
{
// matrices would be filled during convert calls
this.YLeft = default;
this.YRight = default;
this.Cb = default;
this.Cr = default;
// temporal pixel buffers
this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan());
// frame data
this.samplingAreaSize = new Size(frame.Width, frame.Height);
this.config = frame.GetConfiguration();
// conversion vector fallback data
if (!RgbToYCbCrConverterVectorized.IsSupported)
{
this.colorTables = RgbToYCbCrConverterLut.Create();
}
else
{
this.colorTables = default;
}
}
public void Convert(int x, int y, ref RowOctet<TPixel> currentRows, int idx)
{
YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize);
PixelOperations<TPixel>.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan);
if (RgbToYCbCrConverterVectorized.IsSupported)
{
RgbToYCbCrConverterVectorized.Convert420_16x8(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
}
else
{
this.colorTables.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
}
}
}
}

135
src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs

@ -0,0 +1,135 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
/// <summary>
/// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks.
/// </summary>
/// <typeparam name="TPixel">The pixel type to work on</typeparam>
internal ref struct YCbCrForwardConverter444<TPixel>
where TPixel : unmanaged, IPixel<TPixel>
{
// TODO: docs
private const int PixelsPerSample = 8 * 8;
// TODO: docs
private const int RgbSpanByteSize = PixelsPerSample * 3;
// TODO: docs
private static readonly Size SampleSize = new Size(8, 8);
/// <summary>
/// The Y component
/// </summary>
public Block8x8F Y;
/// <summary>
/// The Cb component
/// </summary>
public Block8x8F Cb;
/// <summary>
/// The Cr component
/// </summary>
public Block8x8F Cr;
/// <summary>
/// The color conversion tables
/// </summary>
private RgbToYCbCrConverterLut colorTables;
/// <summary>
/// Temporal 64-byte span to hold unconverted TPixel data
/// </summary>
private Span<TPixel> pixelSpan;
/// <summary>
/// Temporal 64-byte span to hold converted Rgb24 data
/// </summary>
private Span<Rgb24> rgbSpan;
// TODO: docs
private Size samplingAreaSize;
// TODO: docs
private readonly Configuration config;
public YCbCrForwardConverter444(ImageFrame<TPixel> frame)
{
// matrices would be filled during convert calls
this.Y = default;
this.Cb = default;
this.Cr = default;
// temporal pixel buffers
this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxRegisterRgbCompatibilityPadding].AsSpan());
// frame data
this.samplingAreaSize = new Size(frame.Width, frame.Height);
this.config = frame.GetConfiguration();
// conversion vector fallback data
if (!RgbToYCbCrConverterVectorized.IsSupported)
{
this.colorTables = RgbToYCbCrConverterLut.Create();
}
else
{
this.colorTables = default;
}
}
public static YCbCrForwardConverter444<TPixel> Create()
{
var result = default(YCbCrForwardConverter444<TPixel>);
// creating rgb pixel bufferr
// TODO: this is subject to discuss
// converter.Convert comments for +8 padding
result.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + 8].AsSpan());
// TODO: this is subject to discuss
result.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
// Avoid creating lookup tables, when vectorized converter is supported
if (!RgbToYCbCrConverterVectorized.IsSupported)
{
result.colorTables = RgbToYCbCrConverterLut.Create();
}
return result;
}
/// <summary>
/// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
/// </summary>
public void Convert(int x, int y, ref RowOctet<TPixel> currentRows)
{
YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize);
PixelOperations<TPixel>.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan);
ref Block8x8F yBlock = ref this.Y;
ref Block8x8F cbBlock = ref this.Cb;
ref Block8x8F crBlock = ref this.Cr;
if (RgbToYCbCrConverterVectorized.IsSupported)
{
RgbToYCbCrConverterVectorized.Convert(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
}
else
{
this.colorTables.Convert444(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
}
}
}
}

90
src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs

@ -2,81 +2,59 @@
// Licensed under the Apache License, Version 2.0.
using System;
using SixLabors.ImageSharp.Advanced;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
/// <summary>
/// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks.
/// </summary>
/// <typeparam name="TPixel">The pixel type to work on</typeparam>
internal ref struct YCbCrForwardConverter<TPixel>
internal static class YCbCrForwardConverter<TPixel>
where TPixel : unmanaged, IPixel<TPixel>
{
/// <summary>
/// The Y component
/// </summary>
public Block8x8F Y;
/// <summary>
/// The Cb component
/// </summary>
public Block8x8F Cb;
/// <summary>
/// The Cr component
/// </summary>
public Block8x8F Cr;
public static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, Point start, Size sampleSize, Size totalSize)
{
DebugGuard.MustBeBetweenOrEqualTo(start.X, 1, totalSize.Width - 1, nameof(start.X));
DebugGuard.MustBeBetweenOrEqualTo(start.Y, 1, totalSize.Height - 1, nameof(start.Y));
/// <summary>
/// The color conversion tables
/// </summary>
private RgbToYCbCrConverterLut colorTables;
int width = Math.Min(sampleSize.Width, totalSize.Width - start.X);
int height = Math.Min(sampleSize.Height, totalSize.Height - start.Y);
/// <summary>
/// Temporal 8x8 block to hold TPixel data
/// </summary>
private GenericBlock8x8<TPixel> pixelBlock;
uint byteWidth = (uint)(width * Unsafe.SizeOf<TPixel>());
int remainderXCount = sampleSize.Width - width;
/// <summary>
/// Temporal RGB block
/// </summary>
private GenericBlock8x8<Rgb24> rgbBlock;
ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast<TPixel, byte>(dest));
int rowSizeInBytes = sampleSize.Width * Unsafe.SizeOf<TPixel>();
public static YCbCrForwardConverter<TPixel> Create()
{
var result = default(YCbCrForwardConverter<TPixel>);
if (!RgbToYCbCrConverterVectorized.IsSupported)
for (int y = 0; y < height; y++)
{
// Avoid creating lookup tables, when vectorized converter is supported
result.colorTables = RgbToYCbCrConverterLut.Create();
}
Span<TPixel> row = source[y];
return result;
}
ref byte s = ref Unsafe.As<TPixel, byte>(ref row[start.X]);
ref byte d = ref Unsafe.Add(ref blockStart, y * rowSizeInBytes);
/// <summary>
/// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
/// </summary>
public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows)
{
this.pixelBlock.LoadAndStretchEdges(frame.PixelBuffer, x, y, ref currentRows);
Unsafe.CopyBlock(ref d, ref s, byteWidth);
ref TPixel last = ref Unsafe.Add(ref Unsafe.As<byte, TPixel>(ref d), width - 1);
Span<Rgb24> rgbSpan = this.rgbBlock.AsSpanUnsafe();
PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), rgbSpan);
for (int x = 1; x <= remainderXCount; x++)
{
Unsafe.Add(ref last, x) = last;
}
}
ref Block8x8F yBlock = ref this.Y;
ref Block8x8F cbBlock = ref this.Cb;
ref Block8x8F crBlock = ref this.Cr;
int remainderYCount = sampleSize.Height - height;
if (RgbToYCbCrConverterVectorized.IsSupported)
if (remainderYCount == 0)
{
RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
return;
}
else
ref byte lastRowStart = ref Unsafe.Add(ref blockStart, (height - 1) * rowSizeInBytes);
for (int y = 1; y <= remainderYCount; y++)
{
this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
ref byte remStart = ref Unsafe.Add(ref lastRowStart, rowSizeInBytes * y);
Unsafe.CopyBlock(ref remStart, ref lastRowStart, (uint)rowSizeInBytes);
}
}
}

9
tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs

@ -13,9 +13,10 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
{
public class EncodeJpeg
{
[Params(50, 75, 95, 100)]
public int Quality;
private const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr;
// GDI+ most likely uses 75 as default quality - https://stackoverflow.com/questions/3957477/what-quality-level-does-image-save-use-for-jpeg-files
private const int EncodingQuality = 75;
// GDI+ uses 4:2:0 subsampling
private const JpegSubsample EncodingSubsampling = JpegSubsample.Ratio420;
@ -41,14 +42,14 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
this.bmpCore = Image.Load<Rgba32>(this.bmpStream);
this.bmpCore.Metadata.ExifProfile = null;
this.encoder = new JpegEncoder { Quality = EncodingQuality, Subsample = EncodingSubsampling };
this.encoder = new JpegEncoder { Quality = Quality, Subsample = EncodingSubsampling };
this.bmpStream.Position = 0;
this.bmpDrawing = SDImage.FromStream(this.bmpStream);
this.jpegCodec = GetEncoder(ImageFormat.Jpeg);
this.encoderParameters = new EncoderParameters(1);
// Quality cast to long is necessary
this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)EncodingQuality);
this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)Quality);
this.destinationStream = new MemoryStream();
}

2
tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs

@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder
Block8x8F cb = default;
Block8x8F cr = default;
this.converter.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
this.converter.Convert444(this.data.AsSpan(), ref y, ref cb, ref cr);
}
[Benchmark]

2
tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs

@ -32,7 +32,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
Block8x8F cb = default;
Block8x8F cr = default;
target.Convert(data.AsSpan(), ref y, ref cb, ref cr);
target.Convert444(data.AsSpan(), ref y, ref cb, ref cr);
Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F));
}

Loading…
Cancel
Save