diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
similarity index 79%
rename from src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs
rename to src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 236eff27c..835a34f65 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -1,16 +1,17 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
+using System;
using System.Runtime.CompilerServices;
+using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
///
/// Provides 8-bit lookup tables for converting from Rgb to YCbCr colorspace.
/// Methods to build the tables are based on libjpeg implementation.
- /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
///
- internal unsafe struct RgbToYCbCrTables
+ internal unsafe struct RgbToYCbCrConverterLut
{
///
/// The red luminance table
@@ -63,10 +64,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
///
/// Initializes the YCbCr tables
///
- /// The initialized
- public static RgbToYCbCrTables Create()
+ /// The initialized
+ public static RgbToYCbCrConverterLut Create()
{
- RgbToYCbCrTables tables = default;
+ RgbToYCbCrConverterLut tables = default;
for (int i = 0; i <= 255; i++)
{
@@ -92,11 +93,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
}
///
- /// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
/// Optimized method that assigns the correct y, cb, and cr values to the DCT blocks from the given r, g, b values.
///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- public void ConvertPixelInto(
+ private void ConvertPixelInto(
int r,
int g,
int b,
@@ -111,10 +111,29 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
// float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
- // float cr = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero);
+ // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
}
+ /// <summary>
+ /// Converts a block of 64 RGB pixels to Y, Cb and Cr values using the lookup tables.
+ /// </summary>
+ /// <param name="rgbSpan">The source pixels; 64 consecutive entries starting at index 0 are read.</param>
+ /// <param name="yBlock">The block passed on to receive the y values.</param>
+ /// <param name="cbBlock">The block passed on to receive the cb values.</param>
+ /// <param name="crBlock">The block passed on to receive the cr values.</param>
+ public void Convert(Span rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+ {
+ // Only this first access is bounds-checked; the remaining pixels are reached by reference offset.
+ ref Rgb24 firstPixel = ref rgbSpan[0];
+
+ int pixelIndex = 0;
+ while (pixelIndex < 64)
+ {
+ ref Rgb24 pixel = ref Unsafe.Add(ref firstPixel, pixelIndex);
+
+ // The pixel index doubles as the destination index inside the three blocks.
+ this.ConvertPixelInto(pixel.R, pixel.G, pixel.B, ref yBlock, ref cbBlock, ref crBlock, pixelIndex);
+ pixelIndex++;
+ }
+ }
+
/// <summary>
/// Converts a floating-point value to fixed-point by scaling it with 2^ScaleBits.
/// Adding 0.5 before the truncating cast rounds non-negative results to the nearest integer.
/// </summary>
/// <param name="x">The value to scale.</param>
/// <returns>The scaled value as an integer.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Fix(float x)
=> (int)((x * (1L << ScaleBits)) + 0.5F);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
new file mode 100644
index 000000000..068c3db96
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -0,0 +1,182 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+ internal static class RgbToYCbCrConverterVectorized
+ {
+ private static ReadOnlySpan ExtractionMasks => new byte[] // Four 32-byte shuffle controls; ConvertInternal loads each row below as one 256-bit vector.
+ {
+ 0x0, 0xFF, 0xFF, 0xFF, 0x1, 0xFF, 0xFF, 0xFF, 0x2, 0xFF, 0xFF, 0xFF, 0x3, 0xFF, 0xFF, 0xFF, 0x10, 0xFF, 0xFF, 0xFF, 0x11, 0xFF, 0xFF, 0xFF, 0x12, 0xFF, 0xFF, 0xFF, 0x13, 0xFF, 0xFF, 0xFF, // selects bytes 0x0-0x3 and 0x10-0x13, one per 4-byte slot; the 0xFF entries (high bit set) produce zero bytes in a byte shuffle
+ 0x4, 0xFF, 0xFF, 0xFF, 0x5, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF, 0x14, 0xFF, 0xFF, 0xFF, 0x15, 0xFF, 0xFF, 0xFF, 0x16, 0xFF, 0xFF, 0xFF, 0x17, 0xFF, 0xFF, 0xFF, // selects bytes 0x4-0x7 and 0x14-0x17
+ 0x8, 0xFF, 0xFF, 0xFF, 0x9, 0xFF, 0xFF, 0xFF, 0xA, 0xFF, 0xFF, 0xFF, 0xB, 0xFF, 0xFF, 0xFF, 0x18, 0xFF, 0xFF, 0xFF, 0x19, 0xFF, 0xFF, 0xFF, 0x1A, 0xFF, 0xFF, 0xFF, 0x1B, 0xFF, 0xFF, 0xFF, // selects bytes 0x8-0xB and 0x18-0x1B
+ 0xC, 0xFF, 0xFF, 0xFF, 0xD, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF, 0x1C, 0xFF, 0xFF, 0xFF, 0x1D, 0xFF, 0xFF, 0xFF, 0x1E, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF, // selects bytes 0xC-0xF and 0x1C-0x1F
+ };
+
+ public static bool IsSupported // Tells callers whether Convert may be used in this build on this machine.
+ {
+ get
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+ return Avx2.IsSupported && Fma.IsSupported; // ConvertInternal uses both Avx2.Shuffle and Fma.MultiplyAdd; the Sse2/Ssse3 calls in SeparateRgb are not checked separately here.
+#else
+ return false; // Without SUPPORTS_RUNTIME_INTRINSICS the intrinsic code paths are compiled out.
+#endif
+ }
+ }
+
+ public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) // Converts a block of 64 RGB pixels into the three destination blocks; does nothing when built without SUPPORTS_RUNTIME_INTRINSICS.
+ {
+ Debug.Assert(IsSupported, "AVX2 and FMA are required to run this converter");
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ SeparateRgb(rgbSpan); // NOTE: rewrites the memory behind rgbSpan in place (interleaved -> per-channel runs), even though the span type is read-only.
+ ConvertInternal(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); // Reads the per-channel layout produced by the call above.
+#endif
+ }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ /// <summary>
+ /// Rearranges the provided <paramref name="rgbSpan"/> in-place
+ /// from { r00, g00, b00, ..., r63, g63, b63 }
+ /// to { r00, ... r31, g00, ..., g31, b00, ..., b31,
+ /// r32, ... r63, g32, ..., g63, b32, ..., b63 }
+ /// </summary>
+ /// <remarks>
+ /// SSE is used for this operation as it is significantly faster than AVX in this specific case.
+ /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers.
+ /// </remarks>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void SeparateRgb(ReadOnlySpan rgbSpan)
+ {
+ var selectRed0 = Vector128.Create(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); // reds found in the 1st 16-byte chunk -> output bytes 0-5
+ var selectRed1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); // reds found in the 2nd chunk -> output bytes 6-10
+ var selectRed2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D); // reds found in the 3rd chunk -> output bytes 11-15
+
+ var selectGreen0 = Vector128.Create(0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ var selectGreen1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ var selectGreen2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E);
+
+ var selectBlue0 = Vector128.Create(0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ var selectBlue1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+ var selectBlue2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F);
+
+ for (int i = 0; i < 2; i++) // each pass covers six 16-byte vectors = 96 bytes, i.e. one half (32 pixels) of the block
+ {
+ ref Vector128 inRef = ref Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)), i * 6);
+
+ Vector128 in0 = inRef;
+ Vector128 in1 = Unsafe.Add(ref inRef, 1);
+ Vector128 in2 = Unsafe.Add(ref inRef, 2);
+
+ Vector128 r0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2)); // the three shuffles fill disjoint byte positions, so OR merges them into 16 values of one channel
+ Vector128 g0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2));
+ Vector128 b0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2));
+
+ in0 = Unsafe.Add(ref inRef, 3);
+ in1 = Unsafe.Add(ref inRef, 4);
+ in2 = Unsafe.Add(ref inRef, 5);
+
+ Vector128 r1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2));
+ Vector128 g1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2));
+ Vector128 b1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2));
+
+ inRef = r0; // all six input vectors were read above, so the same memory can now be overwritten with the per-channel layout
+ Unsafe.Add(ref inRef, 1) = r1;
+ Unsafe.Add(ref inRef, 2) = g0;
+ Unsafe.Add(ref inRef, 3) = g1;
+ Unsafe.Add(ref inRef, 4) = b0;
+ Unsafe.Add(ref inRef, 5) = b1;
+ }
+ }
+
+ /// <summary>
+ /// Converts the previously separated (see <see cref="SeparateRgb"/>) RGB values to YCbCr using AVX2 and FMA.
+ /// </summary>
+ /// <param name="rgbSpan">The pixel memory, already rearranged into per-channel runs of 32 values.</param>
+ /// <param name="yBlock">The destination block for the y values.</param>
+ /// <param name="cbBlock">The destination block for the cb values.</param>
+ /// <param name="crBlock">The destination block for the cr values.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void ConvertInternal(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+ {
+ var f0299 = Vector256.Create(0.299f);
+ var f0587 = Vector256.Create(0.587f);
+ var f0114 = Vector256.Create(0.114f);
+ var fn0168736 = Vector256.Create(-0.168736f);
+ var fn0331264 = Vector256.Create(-0.331264f);
+ var f128 = Vector256.Create(128f);
+ var fn0418688 = Vector256.Create(-0.418688f);
+ var fn0081312F = Vector256.Create(-0.081312F);
+ var f05 = Vector256.Create(0.5f);
+
+ ref Vector256 inRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan));
+
+ // Each outer pass handles one half of the block: three 256-bit inputs (red, green, blue) and four vectors per destination block.
+ for (int i = 0; i < 2; i++)
+ {
+ ref Vector256 destYRef = ref Unsafe.Add(ref Unsafe.As>(ref yBlock), i * 4);
+ ref Vector256 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), i * 4);
+ ref Vector256 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), i * 4);
+
+ Vector256 red = Unsafe.Add(ref inRef, i * 3);
+ Vector256 green = Unsafe.Add(ref inRef, (i * 3) + 1);
+ Vector256 blue = Unsafe.Add(ref inRef, (i * 3) + 2);
+
+ for (int j = 0; j < 2; j++)
+ {
+ // 1st part of unrolled loop
+ Vector256 mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), j * 2);
+
+ Vector256 r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32());
+ Vector256 g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32());
+ Vector256 b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32());
+
+ // (0.299F * r) + (0.587F * g) + (0.114F * b);
+ Vector256 yy0 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b)));
+
+ // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+ Vector256 cb0 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b))));
+
+ // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+ Vector256 cr0 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b))));
+
+ // 2nd part of unrolled loop
+ mask = Unsafe.Add(ref Unsafe.As>(ref MemoryMarshal.GetReference(ExtractionMasks)), (j * 2) + 1);
+
+ r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32());
+ g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32());
+ b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32());
+
+ // (0.299F * r) + (0.587F * g) + (0.114F * b);
+ Vector256 yy1 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b)));
+
+ // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+ Vector256 cb1 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b))));
+
+ // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+ Vector256 cr1 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b))));
+
+ // store results from 1st and 2nd part
+ // tmp = { upper half of part 0, lower half of part 1 }, therefore
+ // vector j = { part0.lower, part1.lower } (blend of part 0 with tmp)
+ // vector j + 2 = { part0.upper, part1.upper } (blend of part 1 with tmp)
+ Vector256 tmpY = Avx.Permute2x128(yy0, yy1, 0b0010_0001);
+ Unsafe.Add(ref destYRef, j) = Avx.Blend(yy0, tmpY, 0b1111_0000);
+ Unsafe.Add(ref destYRef, j + 2) = Avx.Blend(yy1, tmpY, 0b0000_1111);
+
+ // The second store must blend with part 1 (cb1), exactly like the Y store above;
+ // blending with cb0 would duplicate the upper half of part 0.
+ Vector256 tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001);
+ Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000);
+ Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb1, tmpCb, 0b0000_1111);
+
+ // Same pattern for Cr: the second store takes its upper half from cr1.
+ Vector256 tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001);
+ Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000);
+ Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr1, tmpCr, 0b0000_1111);
+ }
+ }
+ }
+#endif
+ }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index 4d6186e22..b65899327 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -2,7 +2,6 @@
// Licensed under the Apache License, Version 2.0.
using System;
-using System.Runtime.CompilerServices;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.PixelFormats;
@@ -33,7 +32,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
///
/// The color conversion tables
///
- private RgbToYCbCrTables colorTables;
+ private RgbToYCbCrConverterLut colorTables;
///
/// Temporal 8x8 block to hold TPixel data
@@ -48,7 +47,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
public static YCbCrForwardConverter Create()
{
var result = default(YCbCrForwardConverter);
- result.colorTables = RgbToYCbCrTables.Create();
+ if (!RgbToYCbCrConverterVectorized.IsSupported)
+ {
+ // The lookup tables are only read by the non-vectorized fallback in Convert,
+ // so avoid creating them when the vectorized converter is supported.
+ result.colorTables = RgbToYCbCrConverterLut.Create();
+ }
+
return result;
}
@@ -65,20 +69,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
ref Block8x8F yBlock = ref this.Y;
ref Block8x8F cbBlock = ref this.Cb;
ref Block8x8F crBlock = ref this.Cr;
- ref Rgb24 rgbStart = ref rgbSpan[0];
- for (int i = 0; i < 64; i++)
+ if (RgbToYCbCrConverterVectorized.IsSupported)
{
- ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
-
- this.colorTables.ConvertPixelInto(
- c.R,
- c.G,
- c.B,
- ref yBlock,
- ref cbBlock,
- ref crBlock,
- i);
+ RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+ }
+ else
+ {
+ this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
}
}
}
diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
new file mode 100644
index 000000000..1db407293
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
@@ -0,0 +1,56 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Formats.Jpeg.Components;
+using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder
+{
+ /// <summary>
+ /// Benchmarks the lookup-table converter against the vectorized converter on one block of 64 pixels.
+ /// </summary>
+ public class YCbCrForwardConverterBenchmark
+ {
+ private RgbToYCbCrConverterLut converter;
+ private Rgb24[] data;
+
+ /// <summary>
+ /// Creates the lookup-table converter and fills the input block with reproducible pseudo-random pixels.
+ /// </summary>
+ [GlobalSetup]
+ public void Setup()
+ {
+ this.converter = RgbToYCbCrConverterLut.Create();
+
+ // Fixed seed so that every run converts the same pixel values.
+ var random = new Random(42);
+ this.data = new Rgb24[64];
+
+ var channels = new byte[3];
+ for (int pixel = 0; pixel < this.data.Length; pixel++)
+ {
+ random.NextBytes(channels);
+ this.data[pixel] = new Rgb24(channels[0], channels[1], channels[2]);
+ }
+ }
+
+ /// <summary>
+ /// Baseline: converts the block with the lookup-table converter.
+ /// </summary>
+ [Benchmark(Baseline = true)]
+ public void ConvertLut()
+ {
+ Block8x8F y = default;
+ Block8x8F cb = default;
+ Block8x8F cr = default;
+
+ this.converter.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
+ }
+
+ /// <summary>
+ /// Converts the block with the vectorized converter; nothing is converted when it is not supported.
+ /// </summary>
+ [Benchmark]
+ public void ConvertVectorized()
+ {
+ Block8x8F y = default;
+ Block8x8F cb = default;
+ Block8x8F cr = default;
+
+ if (!RgbToYCbCrConverterVectorized.IsSupported)
+ {
+ return;
+ }
+
+ RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
+ }
+ }
+}