Browse Source

Merge branch 'master' into webp

pull/1552/head
James Jackson-South 5 years ago
committed by GitHub
parent
commit
91bf3830c4
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 35
      src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
  2. 120
      src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
  3. 28
      src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
  4. 98
      src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
  5. 56
      tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
  6. 91
      tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
  7. 2
      tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs

35
src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs → src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs

@ -1,16 +1,17 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Runtime.CompilerServices;
using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
/// <summary>
/// Provides 8-bit lookup tables for converting from Rgb to YCbCr colorspace.
/// Methods to build the tables are based on libjpeg implementation.
/// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
/// </summary>
internal unsafe struct RgbToYCbCrTables
internal unsafe struct RgbToYCbCrConverterLut
{
/// <summary>
/// The red luminance table
@ -63,10 +64,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
/// <summary>
/// Initializes the YCbCr tables
/// </summary>
/// <returns>The initialized <see cref="RgbToYCbCrTables"/></returns>
public static RgbToYCbCrTables Create()
/// <returns>The initialized <see cref="RgbToYCbCrConverterLut"/></returns>
public static RgbToYCbCrConverterLut Create()
{
RgbToYCbCrTables tables = default;
RgbToYCbCrConverterLut tables = default;
for (int i = 0; i <= 255; i++)
{
@ -92,11 +93,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
}
/// <summary>
/// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
/// Optimized method to allocate the correct y, cb, and cr values to the DCT blocks from the given r, g, b values.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void ConvertPixelInto(
private void ConvertPixelInto(
int r,
int g,
int b,
@ -111,10 +111,29 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
// float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
// float cr = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero);
// float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
}
/// <summary>
/// Converts the first 64 pixels of <paramref name="rgbSpan"/> through the lookup tables,
/// storing the result for pixel <c>i</c> at index <c>i</c> of each destination block.
/// </summary>
/// <param name="rgbSpan">Source pixels. Only the first element is bounds-checked; callers must supply at least 64.</param>
/// <param name="yBlock">Destination block for the luminance values.</param>
/// <param name="cbBlock">Destination block for the Cb values.</param>
/// <param name="crBlock">Destination block for the Cr values.</param>
public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
{
    // Indexing element 0 keeps the bounds check for an empty span.
    ref Rgb24 firstPixel = ref rgbSpan[0];

    int index = 0;
    while (index < 64)
    {
        Rgb24 pixel = Unsafe.Add(ref firstPixel, index);
        this.ConvertPixelInto(pixel.R, pixel.G, pixel.B, ref yBlock, ref cbBlock, ref crBlock, index);
        index++;
    }
}
/// <summary>
/// Converts a floating point coefficient to a fixed-point integer scaled by <c>2^ScaleBits</c>.
/// 0.5 is added before the cast to <see cref="int"/>, which truncates towards zero.
/// </summary>
/// <param name="x">The coefficient to scale.</param>
/// <returns>The scaled fixed-point value.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Fix(float x)
{
    long one = 1L << ScaleBits;
    return (int)((x * one) + 0.5F);
}

120
src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs

@ -0,0 +1,120 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using System.Diagnostics;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
/// <summary>
/// AVX2 based conversion of 64 <see cref="Rgb24"/> pixels into separate Y, Cb and Cr
/// <see cref="Block8x8F"/> planes using single precision floating point arithmetic.
/// </summary>
internal static class RgbToYCbCrConverterVectorized
{
    /// <summary>
    /// Gets a value indicating whether <see cref="Convert"/> can be used on the current machine.
    /// Always <see langword="false"/> when the library is built without SUPPORTS_RUNTIME_INTRINSICS.
    /// </summary>
    public static bool IsSupported
    {
        get
        {
#if SUPPORTS_RUNTIME_INTRINSICS
            return Avx2.IsSupported;
#else
            return false;
#endif
        }
    }

#if SUPPORTS_RUNTIME_INTRINSICS
    // Permutation indices (little-endian uint32 values 0,1,2,6 | 3,4,5,7) for Avx2.PermuteVar8x32:
    // bytes 0..11 of the loaded vector go to the lower 128-bit lane and bytes 12..23 to the upper lane,
    // so each lane starts with 4 packed RGB pixels. The 4th dword of each lane is padding.
    private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[]
    {
        0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
        3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0
    };

    // Same idea for the LAST 24 bytes of a 32 byte load (uint32 indices 2,3,4,0 | 5,6,7,1):
    // bytes 8..19 go to the lower lane and bytes 20..31 to the upper lane.
    private static ReadOnlySpan<byte> MoveLast24BytesToSeparateLanes => new byte[]
    {
        2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
        5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0
    };

    // Per-lane byte shuffle turning RGBRGBRGBRGB into RRRRGGGGBBBB; the 0xFF entries produce zero bytes.
    private static ReadOnlySpan<byte> ExtractRgb => new byte[]
    {
        0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF,
        0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF
    };
#endif

    /// <summary>
    /// Converts 64 RGB pixels to YCbCr, writing eight <c>Vector256&lt;float&gt;</c> rows into each destination block.
    /// Does nothing when built without SUPPORTS_RUNTIME_INTRINSICS.
    /// </summary>
    /// <param name="rgbSpan">Source pixels; exactly 192 bytes (64 pixels) are read from its start.</param>
    /// <param name="yBlock">Destination for <c>0.299 r + 0.587 g + 0.114 b</c>.</param>
    /// <param name="cbBlock">Destination for <c>128 - 0.168736 r - 0.331264 g + 0.5 b</c>.</param>
    /// <param name="crBlock">Destination for <c>128 + 0.5 r - 0.418688 g - 0.081312 b</c>.</param>
    public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
    {
        Debug.Assert(IsSupported, "AVX2 is required to run this converter");

        // The span is reinterpreted and read without bounds checks below, so a short span would read out of bounds.
        Debug.Assert(rgbSpan.Length >= 64, "A full block of 64 pixels is required to run this converter");

#if SUPPORTS_RUNTIME_INTRINSICS
        var f0299 = Vector256.Create(0.299f);
        var f0587 = Vector256.Create(0.587f);
        var f0114 = Vector256.Create(0.114f);
        var fn0168736 = Vector256.Create(-0.168736f);
        var fn0331264 = Vector256.Create(-0.331264f);
        var f128 = Vector256.Create(128f);
        var fn0418688 = Vector256.Create(-0.418688f);
        var fn0081312F = Vector256.Create(-0.081312F);
        var f05 = Vector256.Create(0.5f);
        Vector256<byte> zero = Vector256<byte>.Zero;

        ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
        ref Vector256<float> destYRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock);
        ref Vector256<float> destCbRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock);
        ref Vector256<float> destCrRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock);

        var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
        var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
        Vector256<byte> rgb, rg, bx;
        Vector256<float> r, g, b;

        // Rows 0..6: each iteration loads 32 bytes starting at byte 24 * i and uses the first 24 (8 pixels).
        // The last of these loads ends at byte 176, inside the 192 byte block.
        for (int i = 0; i < 7; i++)
        {
            rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte();

            rgb = Avx2.Shuffle(rgb, extractRgbMask);

            // Widen bytes to 32-bit integers by interleaving with zero twice, then convert to float.
            rg = Avx2.UnpackLow(rgb, zero);
            bx = Avx2.UnpackHigh(rgb, zero);

            r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
            g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
            b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());

            // (0.299F * r) + (0.587F * g) + (0.114F * b);
            Unsafe.Add(ref destYRef, i) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);

            // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
            Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));

            // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
            Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
        }

        // Row 7: a 32 byte load at byte 168 would run past byte 192, so load the final 32 bytes (offset 160)
        // and pick the LAST 24 bytes of that vector instead.
        extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
        rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
        rgb = Avx2.Shuffle(rgb, extractRgbMask);

        rg = Avx2.UnpackLow(rgb, zero);
        bx = Avx2.UnpackHigh(rgb, zero);

        r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
        g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
        b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());

        // (0.299F * r) + (0.587F * g) + (0.114F * b);
        Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);

        // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
        Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));

        // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
        Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
#endif
    }
}
}

28
src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs

@ -2,7 +2,6 @@
// Licensed under the Apache License, Version 2.0.
using System;
using System.Runtime.CompilerServices;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.PixelFormats;
@ -33,7 +32,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
/// <summary>
/// The color conversion tables
/// </summary>
private RgbToYCbCrTables colorTables;
private RgbToYCbCrConverterLut colorTables;
/// <summary>
/// Temporary 8x8 block to hold TPixel data
@ -48,7 +47,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
public static YCbCrForwardConverter<TPixel> Create()
{
var result = default(YCbCrForwardConverter<TPixel>);
result.colorTables = RgbToYCbCrTables.Create();
if (!RgbToYCbCrConverterVectorized.IsSupported)
{
// Avoid creating lookup tables, when vectorized converter is supported
result.colorTables = RgbToYCbCrConverterLut.Create();
}
return result;
}
@ -65,20 +69,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
ref Block8x8F yBlock = ref this.Y;
ref Block8x8F cbBlock = ref this.Cb;
ref Block8x8F crBlock = ref this.Cr;
ref Rgb24 rgbStart = ref rgbSpan[0];
for (int i = 0; i < 64; i++)
if (RgbToYCbCrConverterVectorized.IsSupported)
{
ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
this.colorTables.ConvertPixelInto(
c.R,
c.G,
c.B,
ref yBlock,
ref cbBlock,
ref crBlock,
i);
RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
}
else
{
this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
}
}
}

98
src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs

@ -4,6 +4,11 @@
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
namespace SixLabors.ImageSharp.Processing.Processors.Transforms
{
@ -66,21 +71,94 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms
[MethodImpl(InliningOptions.ShortMethod)]
public Vector4 ConvolveCore(ref Vector4 rowStartRef)
{
ref float horizontalValues = ref Unsafe.AsRef<float>(this.bufferPtr);
#if SUPPORTS_RUNTIME_INTRINSICS
if (Fma.IsSupported)
{
float* bufferStart = this.bufferPtr;
float* bufferEnd = bufferStart + (this.Length & ~3);
Vector256<float> result256_0 = Vector256<float>.Zero;
Vector256<float> result256_1 = Vector256<float>.Zero;
ReadOnlySpan<byte> maskBytes = new byte[]
{
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0,
};
Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));
// Destination color components
Vector4 result = Vector4.Zero;
while (bufferStart < bufferEnd)
{
// It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
// for the FMA operation, and execute it directly on the target register and reading directly from
// memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
// The code below should compile in the following assembly on .NET 5 x64:
//
// vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _]
// vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
// vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0
//
// For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
// Additionally, we're also unrolling two computations per each loop iterations to leverage the
// fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
result256_0 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
result256_0);
for (int i = 0; i < this.Length; i++)
{
float weight = Unsafe.Add(ref horizontalValues, i);
result256_1 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
result256_1);
bufferStart += 4;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
}
result256_0 = Avx.Add(result256_0, result256_1);
if ((this.Length & 3) >= 2)
{
result256_0 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
result256_0);
bufferStart += 2;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
}
// Vector4 v = offsetedRowSpan[i];
Vector4 v = Unsafe.Add(ref rowStartRef, i);
result += v * weight;
Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());
if ((this.Length & 1) != 0)
{
result128 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
Vector128.Create(*bufferStart),
result128);
}
return *(Vector4*)&result128;
}
else
#endif
{
// Destination color components
Vector4 result = Vector4.Zero;
float* bufferStart = this.bufferPtr;
float* bufferEnd = this.bufferPtr + this.Length;
return result;
while (bufferStart < bufferEnd)
{
// Vector4 v = offsetedRowSpan[i];
result += rowStartRef * *bufferStart;
bufferStart++;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
}
return result;
}
}
/// <summary>

56
tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs

@ -0,0 +1,56 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using BenchmarkDotNet.Attributes;
using SixLabors.ImageSharp.Formats.Jpeg.Components;
using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder
{
/// <summary>
/// Benchmarks the lookup-table RGB to YCbCr converter against the vectorized one
/// on a fixed set of 64 pseudo-random pixels.
/// </summary>
public class YCbCrForwardConverterBenchmark
{
    private RgbToYCbCrConverterLut converter;
    private Rgb24[] data;

    /// <summary>
    /// Fills the input with 64 pixels drawn from a fixed-seed generator and builds the lookup tables.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        var rng = new Random(42);
        var channels = new byte[3];
        var pixels = new Rgb24[64];

        int index = 0;
        while (index < pixels.Length)
        {
            // Three fresh bytes per pixel: R, G, B.
            rng.NextBytes(channels);
            pixels[index] = new Rgb24(channels[0], channels[1], channels[2]);
            index++;
        }

        this.data = pixels;
        this.converter = RgbToYCbCrConverterLut.Create();
    }

    /// <summary>
    /// Baseline: lookup-table conversion of the 64 pixels.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void ConvertLut()
    {
        var y = default(Block8x8F);
        var cb = default(Block8x8F);
        var cr = default(Block8x8F);

        Span<Rgb24> pixels = this.data.AsSpan();
        this.converter.Convert(pixels, ref y, ref cb, ref cr);
    }

    /// <summary>
    /// Vectorized conversion of the same pixels; the body is skipped when the converter is unsupported.
    /// </summary>
    [Benchmark]
    public void ConvertVectorized()
    {
        var y = default(Block8x8F);
        var cb = default(Block8x8F);
        var cr = default(Block8x8F);

        if (!RgbToYCbCrConverterVectorized.IsSupported)
        {
            return;
        }

        RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
    }
}
}

91
tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs

@ -0,0 +1,91 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
using SixLabors.ImageSharp.ColorSpaces;
using SixLabors.ImageSharp.Formats.Jpeg.Components;
using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Tests.Colorspaces.Conversion;
using Xunit;
using Xunit.Abstractions;
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Tests.Formats.Jpg
{
/// <summary>
/// Checks the lookup-table and vectorized RGB to YCbCr converters against the
/// floating point reference formulas on 64 pseudo-random pixels.
/// </summary>
public class RgbToYCbCrConverterTests
{
    public RgbToYCbCrConverterTests(ITestOutputHelper output)
    {
        this.Output = output;
    }

    private ITestOutputHelper Output { get; }

    /// <summary>
    /// The lookup-table converter must match the reference within a tolerance of 1.
    /// </summary>
    [Fact]
    public void TestLutConverter()
    {
        Rgb24[] data = CreateTestData();
        var target = RgbToYCbCrConverterLut.Create();

        Block8x8F y = default;
        Block8x8F cb = default;
        Block8x8F cr = default;

        target.Convert(data.AsSpan(), ref y, ref cb, ref cr);

        Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F));
    }

    /// <summary>
    /// The vectorized converter must match the reference within a tolerance of 0.0001.
    /// Skipped (with a log line) when the converter reports it is unsupported.
    /// </summary>
    [Fact]
    public void TestVectorizedConverter()
    {
        if (!RgbToYCbCrConverterVectorized.IsSupported)
        {
            // IsSupported is gated on AVX2, so that is what the message names.
            this.Output.WriteLine("No AVX2 present, skipping test!");
            return;
        }

        Rgb24[] data = CreateTestData();

        Block8x8F y = default;
        Block8x8F cb = default;
        Block8x8F cr = default;

        RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr);

        Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
    }

    /// <summary>
    /// Recomputes Y, Cb and Cr for every input pixel with the reference formulas and
    /// asserts that the converter output at the same index is approximately equal.
    /// </summary>
    private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer)
    {
        for (int i = 0; i < data.Length; i++)
        {
            int r = data[i].R;
            int g = data[i].G;
            int b = data[i].B;

            float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
            float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
            float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));

            Assert.True(comparer.Equals(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y} == {yResult[i]}, {cb} == {cbResult[i]}, {cr} == {crResult[i]}");
        }
    }

    /// <summary>
    /// Creates 64 pseudo-random pixels. The generator is seeded so that every run
    /// uses the same data and a failure can be reproduced.
    /// </summary>
    private static Rgb24[] CreateTestData()
    {
        var data = new Rgb24[64];
        var r = new Random(42);

        var random = new byte[3];
        for (int i = 0; i < data.Length; i++)
        {
            r.NextBytes(random);
            data[i] = new Rgb24(random[0], random[1], random[2]);
        }

        return data;
    }
}
}

2
tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeTests.cs

@ -139,7 +139,7 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Transforms
testOutputDetails: workingBufferLimitInRows,
appendPixelTypeToFileName: false);
image.CompareToReferenceOutput(
ImageComparer.TolerantPercentage(0.001f),
ImageComparer.TolerantPercentage(0.004f),
provider,
testOutputDetails: workingBufferLimitInRows,
appendPixelTypeToFileName: false);

Loading…
Cancel
Save