|
|
|
@ -204,14 +204,96 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder |
|
|
|
#endif
|
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
/// Converts a 16x8 <see cref="Rgb24"/> pixel matrix to two 8x8 Y blocks and
/// four rows of 4:2:0 subsampled (2x2 averaged) Cb and Cr values.
/// </summary>
/// <param name="rgbSpan">
/// Source pixels, read as 16 consecutive groups of 8 pixels (24 bytes each).
/// Groups 0 and 1 form the first 16 pixel row, groups 2 and 3 the second, and so on.
/// </param>
/// <param name="yBlocks">
/// Destination luminance blocks. Element 0 receives the even numbered groups of
/// <paramref name="rgbSpan"/>, element 1 the odd numbered ones; at least two elements are indexed.
/// </param>
/// <param name="cbBlock">Destination Cb block; four 8-float rows are written starting at row <c>row * 4</c>.</param>
/// <param name="crBlock">Destination Cr block; four 8-float rows are written starting at row <c>row * 4</c>.</param>
/// <param name="row">
/// Index of the 4-row band inside <paramref name="cbBlock"/> and <paramref name="crBlock"/> that is written.
/// Presumably 0 or 1 for an 8x8 block - TODO confirm against callers.
/// </param>
public static void Convert420_16x8(ReadOnlySpan<Rgb24> rgbSpan, Span<Block8x8F> yBlocks, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
{
    Debug.Assert(IsSupported, "AVX2 is required to run this converter");

#if SUPPORTS_RUNTIME_INTRINSICS
    // Y coefficients.
    var f0299 = Vector256.Create(0.299f);
    var f0587 = Vector256.Create(0.587f);
    var f0114 = Vector256.Create(0.114f);

    // Cb / Cr coefficients and the +128 bias shared by both chroma channels.
    var fn0168736 = Vector256.Create(-0.168736f);
    var fn0331264 = Vector256.Create(-0.331264f);
    var f128 = Vector256.Create(128f);
    var fn0418688 = Vector256.Create(-0.418688f);
    var fn0081312F = Vector256.Create(-0.081312F);
    var f05 = Vector256.Create(0.5f);
    var zero = Vector256.Create(0).AsByte();

    ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));

    // Each call fills four chroma rows; 'row' selects which band of four.
    int destOffset = row * 4;

    ref Vector256<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock), destOffset);
    ref Vector256<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock), destOffset);

    var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
    var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
    Vector256<byte> rgb, rg, bx;
    Vector256<float> r, g, b;

    // Per-channel copies of the four vectors of one 16x2 strip, kept for chroma subsampling.
    Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
    Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
    Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];

    // 8 pixels * 3 bytes. Every load below still fetches a full 32 byte vector.
    // NOTE(review): the final load (offset 15 * 24) therefore touches 8 bytes beyond
    // 128 pixels * 3 bytes; callers presumably pass a padded buffer - confirm.
    const int bytesPerRgbStride = 24;
    for (int i = 0; i < 4; i++)
    {
        // 16x2 => 8x1
        for (int j = 0; j < 4; j++)
        {
            rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();

            // The unpack sequence below expects every 128-bit lane to hold
            // 4 R bytes, 4 G bytes, 4 B bytes and 4 unused bytes after this shuffle.
            rgb = Avx2.Shuffle(rgb, extractRgbMask);

            // Zero-extend bytes to 16 bit, then to 32 bit, and convert to float.
            rg = Avx2.UnpackLow(rgb, zero);
            bx = Avx2.UnpackHigh(rgb, zero);

            r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
            g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
            b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());

            // j & 1 selects the Y block, j & 2 selects the first or second row of the strip.
            int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1);

            // (0.299F * r) + (0.587F * g) + (0.114F * b);
            Unsafe.Add(ref yBlocks[j & 1].V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);

            rDataLanes[j] = r;
            gDataLanes[j] = g;
            bDataLanes[j] = b;
        }

        // Reduce the four stored vectors per channel to a single vector of 8 chroma inputs.
        r = Scale_8x4_4x2(rDataLanes);
        g = Scale_8x4_4x2(gDataLanes);
        b = Scale_8x4_4x2(bDataLanes);

        // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
        Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));

        // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
        Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
    }
#endif
}
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Reduces four 8-float vectors to a single 8-float vector of 2x2 averages.
/// </summary>
/// <param name="v">
/// Exactly four input vectors are read. Elements 0 and 2 are combined with each other,
/// as are elements 1 and 3; <c>Convert420_16x8</c> stores two adjacent rows of its first
/// Y block at indices 0 and 2 and two adjacent rows of its second Y block at indices 1 and 3.
/// </param>
/// <returns>The pair sums multiplied by 0.25 and reordered by the inner double-word permutation.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> Scale_8x4_4x2(Span<Vector256<float>> v)
{
    Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
    var f025 = Vector256.Create(0.25f);

    // Only one declaration of each local may exist: the earlier (v[0], v[1]) / (v[2], v[3])
    // pairing was a stale duplicate that mixed vectors from different 8 pixel column groups.
    // NOTE(review): SumHorizontalPairs / SumVerticalPairs are defined elsewhere; presumably they
    // add neighbouring columns and then the two rows of each pair - confirm.
    Vector256<float> topPairSum = SumHorizontalPairs(v[0], v[2]);
    Vector256<float> botPairSum = SumHorizontalPairs(v[1], v[3]);

    // Four samples contribute to every output value, hence the 0.25 factor.
    return Avx2.PermuteVar8x32(Avx.Multiply(SumVerticalPairs(topPairSum, botPairSum), f025), switchInnerDoubleWords);
}
|
|
|
|