mirror of https://github.com/SixLabors/ImageSharp
11 changed files with 704 additions and 36 deletions
@ -0,0 +1,145 @@ |
|||||
|
// Copyright (c) Six Labors.
|
||||
|
// Licensed under the Apache License, Version 2.0.
|
||||
|
|
||||
|
using System; |
||||
|
using System.Numerics; |
||||
|
using System.Runtime.CompilerServices; |
||||
|
using System.Runtime.InteropServices; |
||||
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
||||
|
using System.Runtime.Intrinsics; |
||||
|
using System.Runtime.Intrinsics.X86; |
||||
|
using static SixLabors.ImageSharp.SimdUtils; |
||||
|
#else
|
||||
|
using SixLabors.ImageSharp.Tuples; |
||||
|
#endif
|
||||
|
|
||||
|
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
    internal abstract partial class JpegColorConverter
    {
        /// <summary>
        /// Converts CMYK component values (one span per component) into <see cref="Vector4"/> pixels,
        /// handling 8 pixels per loop iteration with 8-wide float vectors.
        /// </summary>
        internal sealed class FromCmykVector8 : JpegColorConverter
        {
            /// <summary>
            /// Initializes a new instance of the <see cref="FromCmykVector8"/> class.
            /// </summary>
            /// <param name="precision">The precision value forwarded to the base converter.</param>
            public FromCmykVector8(int precision)
                : base(JpegColorSpace.Cmyk, precision)
            {
            }

            /// <summary>
            /// Gets a value indicating whether this converter can be used: <see cref="Vector"/> must be
            /// hardware accelerated and <c>SimdUtils.HasVector8</c> must be true.
            /// </summary>
            public static bool IsAvailable => Vector.IsHardwareAccelerated && SimdUtils.HasVector8;

            /// <summary>
            /// Converts all of <paramref name="values"/> into <paramref name="result"/>.
            /// The largest multiple-of-8 prefix goes through the vectorized <see cref="ConvertCore"/>;
            /// the remaining (fewer than 8) pixels go through <c>FromCmykBasic.ConvertCore</c>.
            /// </summary>
            /// <param name="values">The input component values.</param>
            /// <param name="result">The destination span of pixels.</param>
            public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
            {
                int remainder = result.Length % 8;
                int simdCount = result.Length - remainder;
                if (simdCount > 0)
                {
                    ConvertCore(values.Slice(0, simdCount), result.Slice(0, simdCount), this.MaximumValue);
                }

                // Scalar fallback for the tail that does not fill a whole 8-wide vector.
                FromCmykBasic.ConvertCore(values.Slice(simdCount, remainder), result.Slice(simdCount, remainder), this.MaximumValue);
            }

            /// <summary>
            /// Vectorized conversion of <c>result.Length / 8</c> groups of 8 pixels.
            /// For each pixel the output is <c>(c * k', m * k', y * k', 1)</c> scaled by <c>1 / maxValue</c>,
            /// where <c>k' = k / maxValue</c>.
            /// </summary>
            /// <remarks>
            /// The spans are accessed through <see cref="Unsafe"/> references without bounds checks;
            /// callers must supply component spans that cover at least <c>result.Length</c> elements.
            /// </remarks>
            /// <param name="values">The input component values (Component0..3 = C, M, Y, K).</param>
            /// <param name="result">The destination span of pixels.</param>
            /// <param name="maxValue">The maximum component value used for normalization.</param>
            /// <exception cref="InvalidOperationException">Thrown when <see cref="IsAvailable"/> is false.</exception>
            internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
            {
                // This implementation is actually AVX specific.
                // An AVX register is capable of storing 8 float-s (8 x 32 bit = 256 bit).
                if (!IsAvailable)
                {
                    throw new InvalidOperationException(
                        $"{nameof(JpegColorConverter)}.{nameof(FromCmykVector8)} can be used only on architecture having 256 bit floating point SIMD registers!");
                }

#if SUPPORTS_RUNTIME_INTRINSICS
                ref Vector256<float> cBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
                ref Vector256<float> mBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
                ref Vector256<float> yBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
                ref Vector256<float> kBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));

                ref Vector256<float> resultBase =
                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

                // Used for the color conversion
                var scale = Vector256.Create(1 / maxValue);
                var one = Vector256.Create(1F);

                // Used for packing.
                // NOTE(review): the mask presumably reorders lanes to (0,2,4,6,1,3,5,7) so that the
                // per-128-bit-lane unpack/shuffle below emits pixels in source order - confirm against HwIntrinsics.
                ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
                Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

                // Walking 8 elements at one step:
                int n = result.Length / 8;
                for (int i = 0; i < n; i++)
                {
                    Vector256<float> k = Avx2.PermuteVar8x32(Unsafe.Add(ref kBase, i), vcontrol);
                    Vector256<float> c = Avx2.PermuteVar8x32(Unsafe.Add(ref cBase, i), vcontrol);
                    Vector256<float> m = Avx2.PermuteVar8x32(Unsafe.Add(ref mBase, i), vcontrol);
                    Vector256<float> y = Avx2.PermuteVar8x32(Unsafe.Add(ref yBase, i), vcontrol);

                    // k' = k / maxValue
                    k = Avx.Multiply(k, scale);

                    // channel = (channel * k') / maxValue
                    c = Avx.Multiply(Avx.Multiply(c, k), scale);
                    m = Avx.Multiply(Avx.Multiply(m, k), scale);
                    y = Avx.Multiply(Avx.Multiply(y, k), scale);

                    // Interleave (c, m) and (y, 1) pairs, then combine them into (c, m, y, 1) pixels.
                    Vector256<float> cmLo = Avx.UnpackLow(c, m);
                    Vector256<float> yoLo = Avx.UnpackLow(y, one);
                    Vector256<float> cmHi = Avx.UnpackHigh(c, m);
                    Vector256<float> yoHi = Avx.UnpackHigh(y, one);

                    // Each Vector256<float> holds 2 pixels, so 8 pixels occupy 4 destination vectors.
                    ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);

                    destination = Avx.Shuffle(cmLo, yoLo, 0b01_00_01_00);
                    Unsafe.Add(ref destination, 1) = Avx.Shuffle(cmLo, yoLo, 0b11_10_11_10);
                    Unsafe.Add(ref destination, 2) = Avx.Shuffle(cmHi, yoHi, 0b01_00_01_00);
                    Unsafe.Add(ref destination, 3) = Avx.Shuffle(cmHi, yoHi, 0b11_10_11_10);
                }
#else
                ref Vector<float> cBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
                ref Vector<float> mBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
                ref Vector<float> yBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));
                ref Vector<float> kBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component3));

                ref Vector4Octet resultBase =
                    ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));

                // Scratch storage reinterpreted as Vector<float> so whole vectors can be written into it.
                Vector4Pair cc = default;
                Vector4Pair mm = default;
                Vector4Pair yy = default;
                ref Vector<float> ccRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref cc);
                ref Vector<float> mmRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref mm);
                ref Vector<float> yyRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref yy);

                var scale = new Vector<float>(1 / maxValue);

                // Walking 8 elements at one step:
                int n = result.Length / 8;
                for (int i = 0; i < n; i++)
                {
                    Vector<float> c = Unsafe.Add(ref cBase, i);
                    Vector<float> m = Unsafe.Add(ref mBase, i);
                    Vector<float> y = Unsafe.Add(ref yBase, i);

                    // k' = k / maxValue
                    Vector<float> k = Unsafe.Add(ref kBase, i) * scale;

                    // channel = (channel * k') / maxValue
                    c = (c * k) * scale;
                    m = (m * k) * scale;
                    y = (y * k) * scale;

                    ccRefAsVector = c;
                    mmRefAsVector = m;
                    yyRefAsVector = y;

                    // Collect (c0,c1...c7) (m0,m1...m7) (y0,y1...y7) vector values in the expected (c0,m0,y0,1), (c1,m1,y1,1) ... order:
                    ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
                    destination.Pack(ref cc, ref mm, ref yy);
                }
#endif
            }
        }
    }
}
||||
@ -0,0 +1,109 @@ |
|||||
|
// Copyright (c) Six Labors.
|
||||
|
// Licensed under the Apache License, Version 2.0.
|
||||
|
|
||||
|
using System; |
||||
|
using System.Numerics; |
||||
|
using System.Runtime.CompilerServices; |
||||
|
using System.Runtime.InteropServices; |
||||
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
||||
|
using System.Runtime.Intrinsics; |
||||
|
using System.Runtime.Intrinsics.X86; |
||||
|
using static SixLabors.ImageSharp.SimdUtils; |
||||
|
#else
|
||||
|
using SixLabors.ImageSharp.Tuples; |
||||
|
#endif
|
||||
|
|
||||
|
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
    internal abstract partial class JpegColorConverter
    {
        /// <summary>
        /// Converts single-component grayscale values into <see cref="Vector4"/> pixels,
        /// handling 8 pixels per loop iteration with 8-wide float vectors.
        /// </summary>
        internal sealed class FromGrayscaleVector8 : JpegColorConverter
        {
            /// <summary>
            /// Initializes a new instance of the <see cref="FromGrayscaleVector8"/> class.
            /// </summary>
            /// <param name="precision">The precision value forwarded to the base converter.</param>
            public FromGrayscaleVector8(int precision)
                : base(JpegColorSpace.Grayscale, precision)
            {
            }

            /// <summary>
            /// Gets a value indicating whether this converter can be used: <see cref="Vector"/> must be
            /// hardware accelerated and <c>SimdUtils.HasVector8</c> must be true.
            /// </summary>
            public static bool IsAvailable => Vector.IsHardwareAccelerated && SimdUtils.HasVector8;

            /// <summary>
            /// Converts all of <paramref name="values"/> into <paramref name="result"/>.
            /// The largest multiple-of-8 prefix goes through the vectorized <see cref="ConvertCore"/>;
            /// the remaining (fewer than 8) pixels go through <c>FromGrayscaleBasic.ConvertCore</c>.
            /// </summary>
            /// <param name="values">The input component values.</param>
            /// <param name="result">The destination span of pixels.</param>
            public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
            {
                int remainder = result.Length % 8;
                int simdCount = result.Length - remainder;
                if (simdCount > 0)
                {
                    ConvertCore(values.Slice(0, simdCount), result.Slice(0, simdCount), this.MaximumValue);
                }

                // Scalar fallback for the tail that does not fill a whole 8-wide vector.
                FromGrayscaleBasic.ConvertCore(values.Slice(simdCount, remainder), result.Slice(simdCount, remainder), this.MaximumValue);
            }

            /// <summary>
            /// Vectorized conversion of <c>result.Length / 8</c> groups of 8 pixels.
            /// Each output pixel is <c>(g, g, g, 1)</c> with <c>g = value / maxValue</c>.
            /// </summary>
            /// <remarks>
            /// The spans are accessed through <see cref="Unsafe"/> references without bounds checks;
            /// callers must supply a component span that covers at least <c>result.Length</c> elements.
            /// </remarks>
            /// <param name="values">The input component values (only Component0 is read).</param>
            /// <param name="result">The destination span of pixels.</param>
            /// <param name="maxValue">The maximum component value used for normalization.</param>
            /// <exception cref="InvalidOperationException">Thrown when <see cref="IsAvailable"/> is false.</exception>
            internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
            {
                // This implementation is actually AVX specific.
                // An AVX register is capable of storing 8 float-s (8 x 32 bit = 256 bit).
                if (!IsAvailable)
                {
                    throw new InvalidOperationException(
                        $"{nameof(JpegColorConverter)}.{nameof(FromGrayscaleVector8)} can be used only on architecture having 256 bit floating point SIMD registers!");
                }

#if SUPPORTS_RUNTIME_INTRINSICS
                ref Vector256<float> gBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));

                ref Vector256<float> resultBase =
                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

                // Used for the color conversion
                var scale = Vector256.Create(1 / maxValue);
                var one = Vector256.Create(1F);

                // Used for packing.
                // NOTE(review): the mask presumably reorders lanes to (0,2,4,6,1,3,5,7) so that the
                // per-128-bit-lane permutes below emit pixels in source order - confirm against HwIntrinsics.
                ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
                Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

                // Walking 8 elements at one step:
                int n = result.Length / 8;
                for (int i = 0; i < n; i++)
                {
                    Vector256<float> g = Avx2.PermuteVar8x32(Unsafe.Add(ref gBase, i), vcontrol);

                    g = Avx.Multiply(g, scale);

                    // Each Vector256<float> holds 2 pixels, so 8 pixels occupy 4 destination vectors.
                    ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);

                    // Broadcast one element within each 128-bit lane, then overwrite
                    // the 4th float of each lane (blend mask bits 3 and 7) with 1.
                    destination = Avx.Blend(Avx.Permute(g, 0b00_00_00_00), one, 0b1000_1000);
                    Unsafe.Add(ref destination, 1) = Avx.Blend(Avx.Permute(g, 0b01_01_01_01), one, 0b1000_1000);
                    Unsafe.Add(ref destination, 2) = Avx.Blend(Avx.Permute(g, 0b10_10_10_10), one, 0b1000_1000);
                    Unsafe.Add(ref destination, 3) = Avx.Blend(Avx.Permute(g, 0b11_11_11_11), one, 0b1000_1000);
                }
#else
                ref Vector<float> gBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));

                ref Vector4Octet resultBase =
                    ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));

                // Scratch storage reinterpreted as Vector<float> so a whole vector can be written into it.
                Vector4Pair gg = default;
                ref Vector<float> ggRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref gg);

                var scale = new Vector<float>(1 / maxValue);

                // Walking 8 elements at one step:
                int n = result.Length / 8;
                for (int i = 0; i < n; i++)
                {
                    Vector<float> g = Unsafe.Add(ref gBase, i);
                    g *= scale;

                    ggRefAsVector = g;

                    // Collect (g0,g1...g7) vector values in the expected (g0,g0,g0,1), (g1,g1,g1,1) ... order:
                    ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
                    destination.Pack(ref gg);
                }
#endif
            }
        }
    }
}
||||
@ -0,0 +1,132 @@ |
|||||
|
// Copyright (c) Six Labors.
|
||||
|
// Licensed under the Apache License, Version 2.0.
|
||||
|
|
||||
|
using System; |
||||
|
using System.Numerics; |
||||
|
using System.Runtime.CompilerServices; |
||||
|
using System.Runtime.InteropServices; |
||||
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
||||
|
using System.Runtime.Intrinsics; |
||||
|
using System.Runtime.Intrinsics.X86; |
||||
|
using static SixLabors.ImageSharp.SimdUtils; |
||||
|
#else
|
||||
|
using SixLabors.ImageSharp.Tuples; |
||||
|
#endif
|
||||
|
|
||||
|
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
    internal abstract partial class JpegColorConverter
    {
        /// <summary>
        /// Converts RGB component values (one span per component) into <see cref="Vector4"/> pixels,
        /// handling 8 pixels per loop iteration with 8-wide float vectors.
        /// </summary>
        internal sealed class FromRgbVector8 : JpegColorConverter
        {
            /// <summary>
            /// Initializes a new instance of the <see cref="FromRgbVector8"/> class.
            /// </summary>
            /// <param name="precision">The precision value forwarded to the base converter.</param>
            public FromRgbVector8(int precision)
                : base(JpegColorSpace.RGB, precision)
            {
            }

            /// <summary>
            /// Gets a value indicating whether this converter can be used: <see cref="Vector"/> must be
            /// hardware accelerated and <c>SimdUtils.HasVector8</c> must be true.
            /// </summary>
            public static bool IsAvailable => Vector.IsHardwareAccelerated && SimdUtils.HasVector8;

            /// <summary>
            /// Converts all of <paramref name="values"/> into <paramref name="result"/>.
            /// The largest multiple-of-8 prefix goes through the vectorized <see cref="ConvertCore"/>;
            /// the remaining (fewer than 8) pixels go through <c>FromRgbBasic.ConvertCore</c>.
            /// </summary>
            /// <param name="values">The input component values.</param>
            /// <param name="result">The destination span of pixels.</param>
            public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
            {
                int remainder = result.Length % 8;
                int simdCount = result.Length - remainder;
                if (simdCount > 0)
                {
                    ConvertCore(values.Slice(0, simdCount), result.Slice(0, simdCount), this.MaximumValue);
                }

                // Scalar fallback for the tail that does not fill a whole 8-wide vector.
                FromRgbBasic.ConvertCore(values.Slice(simdCount, remainder), result.Slice(simdCount, remainder), this.MaximumValue);
            }

            /// <summary>
            /// Vectorized conversion of <c>result.Length / 8</c> groups of 8 pixels.
            /// Each output pixel is <c>(r, g, b, 1)</c> with every channel divided by <paramref name="maxValue"/>.
            /// </summary>
            /// <remarks>
            /// The spans are accessed through <see cref="Unsafe"/> references without bounds checks;
            /// callers must supply component spans that cover at least <c>result.Length</c> elements.
            /// </remarks>
            /// <param name="values">The input component values (Component0..2 = R, G, B).</param>
            /// <param name="result">The destination span of pixels.</param>
            /// <param name="maxValue">The maximum component value used for normalization.</param>
            /// <exception cref="InvalidOperationException">Thrown when <see cref="IsAvailable"/> is false.</exception>
            internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
            {
                // This implementation is actually AVX specific.
                // An AVX register is capable of storing 8 float-s (8 x 32 bit = 256 bit).
                if (!IsAvailable)
                {
                    throw new InvalidOperationException(
                        $"{nameof(JpegColorConverter)}.{nameof(FromRgbVector8)} can be used only on architecture having 256 bit floating point SIMD registers!");
                }

#if SUPPORTS_RUNTIME_INTRINSICS
                ref Vector256<float> rBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
                ref Vector256<float> gBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
                ref Vector256<float> bBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));

                ref Vector256<float> resultBase =
                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

                // Used for the color conversion
                var scale = Vector256.Create(1 / maxValue);
                var one = Vector256.Create(1F);

                // Used for packing.
                // NOTE(review): the mask presumably reorders lanes to (0,2,4,6,1,3,5,7) so that the
                // per-128-bit-lane unpack/shuffle below emits pixels in source order - confirm against HwIntrinsics.
                ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
                Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

                // Walking 8 elements at one step:
                int n = result.Length / 8;
                for (int i = 0; i < n; i++)
                {
                    // Permute for packing, then normalize by 1 / maxValue.
                    Vector256<float> r = Avx.Multiply(Avx2.PermuteVar8x32(Unsafe.Add(ref rBase, i), vcontrol), scale);
                    Vector256<float> g = Avx.Multiply(Avx2.PermuteVar8x32(Unsafe.Add(ref gBase, i), vcontrol), scale);
                    Vector256<float> b = Avx.Multiply(Avx2.PermuteVar8x32(Unsafe.Add(ref bBase, i), vcontrol), scale);

                    // Interleave (r, g) and (b, 1) pairs, then combine them into (r, g, b, 1) pixels.
                    Vector256<float> rgLo = Avx.UnpackLow(r, g);
                    Vector256<float> boLo = Avx.UnpackLow(b, one);
                    Vector256<float> rgHi = Avx.UnpackHigh(r, g);
                    Vector256<float> boHi = Avx.UnpackHigh(b, one);

                    // Each Vector256<float> holds 2 pixels, so 8 pixels occupy 4 destination vectors.
                    ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);

                    destination = Avx.Shuffle(rgLo, boLo, 0b01_00_01_00);
                    Unsafe.Add(ref destination, 1) = Avx.Shuffle(rgLo, boLo, 0b11_10_11_10);
                    Unsafe.Add(ref destination, 2) = Avx.Shuffle(rgHi, boHi, 0b01_00_01_00);
                    Unsafe.Add(ref destination, 3) = Avx.Shuffle(rgHi, boHi, 0b11_10_11_10);
                }
#else
                ref Vector<float> rBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
                ref Vector<float> gBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
                ref Vector<float> bBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));

                ref Vector4Octet resultBase =
                    ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));

                // Scratch storage reinterpreted as Vector<float> so whole vectors can be written into it.
                Vector4Pair rr = default;
                Vector4Pair gg = default;
                Vector4Pair bb = default;
                ref Vector<float> rrRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref rr);
                ref Vector<float> ggRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref gg);
                ref Vector<float> bbRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref bb);

                var scale = new Vector<float>(1 / maxValue);

                // Walking 8 elements at one step:
                int n = result.Length / 8;
                for (int i = 0; i < n; i++)
                {
                    Vector<float> r = Unsafe.Add(ref rBase, i);
                    Vector<float> g = Unsafe.Add(ref gBase, i);
                    Vector<float> b = Unsafe.Add(ref bBase, i);
                    r *= scale;
                    g *= scale;
                    b *= scale;

                    rrRefAsVector = r;
                    ggRefAsVector = g;
                    bbRefAsVector = b;

                    // Collect (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values in the expected (r0,g0,b0,1), (r1,g1,b1,1) ... order:
                    ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
                    destination.Pack(ref rr, ref gg, ref bb);
                }
#endif
            }
        }
    }
}
||||
@ -0,0 +1,193 @@ |
|||||
|
// Copyright (c) Six Labors.
|
||||
|
// Licensed under the Apache License, Version 2.0.
|
||||
|
|
||||
|
using System; |
||||
|
using System.Numerics; |
||||
|
using System.Runtime.CompilerServices; |
||||
|
using System.Runtime.InteropServices; |
||||
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
||||
|
using System.Runtime.Intrinsics; |
||||
|
using System.Runtime.Intrinsics.X86; |
||||
|
using static SixLabors.ImageSharp.SimdUtils; |
||||
|
#else
|
||||
|
using SixLabors.ImageSharp.Tuples; |
||||
|
#endif
|
||||
|
|
||||
|
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
    internal abstract partial class JpegColorConverter
    {
        /// <summary>
        /// Converts YCCK component values (one span per component) into <see cref="Vector4"/> pixels,
        /// handling 8 pixels per loop iteration with 8-wide float vectors.
        /// </summary>
        internal sealed class FromYccKVector8 : JpegColorConverter
        {
            /// <summary>
            /// Initializes a new instance of the <see cref="FromYccKVector8"/> class.
            /// </summary>
            /// <param name="precision">The precision value forwarded to the base converter.</param>
            public FromYccKVector8(int precision)
                : base(JpegColorSpace.Ycck, precision)
            {
            }

            /// <summary>
            /// Gets a value indicating whether this converter can be used: <see cref="Vector"/> must be
            /// hardware accelerated and <c>SimdUtils.HasVector8</c> must be true.
            /// </summary>
            public static bool IsAvailable => Vector.IsHardwareAccelerated && SimdUtils.HasVector8;

            /// <summary>
            /// Converts all of <paramref name="values"/> into <paramref name="result"/>.
            /// The largest multiple-of-8 prefix goes through the vectorized <see cref="ConvertCore"/>;
            /// the remaining (fewer than 8) pixels go through <c>FromYccKBasic.ConvertCore</c>.
            /// </summary>
            /// <param name="values">The input component values.</param>
            /// <param name="result">The destination span of pixels.</param>
            public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
            {
                int remainder = result.Length % 8;
                int simdCount = result.Length - remainder;
                if (simdCount > 0)
                {
                    ConvertCore(values.Slice(0, simdCount), result.Slice(0, simdCount), this.MaximumValue, this.HalfValue);
                }

                // Scalar fallback for the tail that does not fill a whole 8-wide vector.
                FromYccKBasic.ConvertCore(values.Slice(simdCount, remainder), result.Slice(simdCount, remainder), this.MaximumValue, this.HalfValue);
            }

            /// <summary>
            /// Vectorized conversion of <c>result.Length / 8</c> groups of 8 pixels.
            /// Y/Cb/Cr are first converted to r/g/b, each channel is then replaced by
            /// <c>(maxValue - round(channel)) * (k / maxValue) / maxValue</c>, and the alpha lane is set to 1.
            /// </summary>
            /// <remarks>
            /// The spans are accessed through <see cref="Unsafe"/> references without bounds checks;
            /// callers must supply component spans that cover at least <c>result.Length</c> elements.
            /// </remarks>
            /// <param name="values">The input component values (Component0..3 = Y, Cb, Cr, K).</param>
            /// <param name="result">The destination span of pixels.</param>
            /// <param name="maxValue">The maximum component value used for normalization.</param>
            /// <param name="halfValue">The value subtracted from the Cb and Cr components.</param>
            /// <exception cref="InvalidOperationException">Thrown when <see cref="IsAvailable"/> is false.</exception>
            internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue, float halfValue)
            {
                // This implementation is actually AVX specific.
                // An AVX register is capable of storing 8 float-s (8 x 32 bit = 256 bit).
                if (!IsAvailable)
                {
                    throw new InvalidOperationException(
                        $"{nameof(JpegColorConverter)}.{nameof(FromYccKVector8)} can be used only on architecture having 256 bit floating point SIMD registers!");
                }

#if SUPPORTS_RUNTIME_INTRINSICS
                ref Vector256<float> yBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
                ref Vector256<float> cbBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
                ref Vector256<float> crBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
                ref Vector256<float> kBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));

                ref Vector256<float> resultBase =
                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

                // Used for the color conversion
                var chromaOffset = Vector256.Create(-halfValue);
                var scale = Vector256.Create(1 / maxValue);
                var max = Vector256.Create(maxValue);
                var rCrMult = Vector256.Create(1.402F);
                var gCbMult = Vector256.Create(-0.344136F);
                var gCrMult = Vector256.Create(-0.714136F);
                var bCbMult = Vector256.Create(1.772F);

                // Used for packing.
                // NOTE(review): the mask presumably reorders lanes to (0,2,4,6,1,3,5,7) so that the
                // per-128-bit-lane unpacks below emit pixels in source order - confirm against HwIntrinsics.
                var va = Vector256.Create(1F);
                ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
                Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

                // Walking 8 elements at one step:
                int n = result.Length / 8;
                for (int i = 0; i < n; i++)
                {
                    // y = yVals[i];
                    // cb = cbVals[i] - halfValue;
                    // cr = crVals[i] - halfValue;
                    // k = kVals[i] / maxValue;
                    Vector256<float> y = Unsafe.Add(ref yBase, i);
                    Vector256<float> cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset);
                    Vector256<float> cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset);
                    Vector256<float> k = Avx.Divide(Unsafe.Add(ref kBase, i), max);

                    y = Avx2.PermuteVar8x32(y, vcontrol);
                    cb = Avx2.PermuteVar8x32(cb, vcontrol);
                    cr = Avx2.PermuteVar8x32(cr, vcontrol);
                    k = Avx2.PermuteVar8x32(k, vcontrol);

                    // r = y + (1.402F * cr);
                    // g = y - (0.344136F * cb) - (0.714136F * cr);
                    // b = y + (1.772F * cb);
                    // Adding & multiplying 8 elements at one time:
                    Vector256<float> r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult);
                    Vector256<float> g = HwIntrinsics.MultiplyAdd(HwIntrinsics.MultiplyAdd(y, cb, gCbMult), cr, gCrMult);
                    Vector256<float> b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult);

                    // channel = maxValue - round(channel)
                    r = Avx.Subtract(max, Avx.RoundToNearestInteger(r));
                    g = Avx.Subtract(max, Avx.RoundToNearestInteger(g));
                    b = Avx.Subtract(max, Avx.RoundToNearestInteger(b));

                    // channel = (channel * k) / maxValue
                    r = Avx.Multiply(Avx.Multiply(r, k), scale);
                    g = Avx.Multiply(Avx.Multiply(g, k), scale);
                    b = Avx.Multiply(Avx.Multiply(b, k), scale);

                    // Interleave (r, b) and (g, 1); unpacking those two again yields (r, g, b, 1) pixels.
                    Vector256<float> vte = Avx.UnpackLow(r, b);
                    Vector256<float> vto = Avx.UnpackLow(g, va);

                    // Each Vector256<float> holds 2 pixels, so 8 pixels occupy 4 destination vectors.
                    ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);

                    destination = Avx.UnpackLow(vte, vto);
                    Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto);

                    vte = Avx.UnpackHigh(r, b);
                    vto = Avx.UnpackHigh(g, va);

                    Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto);
                    Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto);
                }
#else
                ref Vector<float> yBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
                ref Vector<float> cbBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
                ref Vector<float> crBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));
                ref Vector<float> kBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component3));

                ref Vector4Octet resultBase =
                    ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));

                var chromaOffset = new Vector<float>(-halfValue);

                // Walking 8 elements at one step:
                int n = result.Length / 8;

                // Scratch storage reinterpreted as Vector<float> so whole vectors can be written into it.
                Vector4Pair rr = default;
                Vector4Pair gg = default;
                Vector4Pair bb = default;

                ref Vector<float> rrRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref rr);
                ref Vector<float> ggRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref gg);
                ref Vector<float> bbRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref bb);

                var scale = new Vector<float>(1 / maxValue);
                var max = new Vector<float>(maxValue);

                // Loop-invariant conversion coefficients, created once instead of on every iteration.
                var rCrFactor = new Vector<float>(1.402F);
                var gCbFactor = new Vector<float>(0.344136F);
                var gCrFactor = new Vector<float>(0.714136F);
                var bCbFactor = new Vector<float>(1.772F);

                for (int i = 0; i < n; i++)
                {
                    // y = yVals[i];
                    // cb = cbVals[i] - halfValue;
                    // cr = crVals[i] - halfValue;
                    // k = kVals[i] / maxValue;
                    Vector<float> y = Unsafe.Add(ref yBase, i);
                    Vector<float> cb = Unsafe.Add(ref cbBase, i) + chromaOffset;
                    Vector<float> cr = Unsafe.Add(ref crBase, i) + chromaOffset;
                    Vector<float> k = Unsafe.Add(ref kBase, i) / max;

                    // r = y + (1.402F * cr);
                    // g = y - (0.344136F * cb) - (0.714136F * cr);
                    // b = y + (1.772F * cb);
                    // Adding & multiplying 8 elements at one time:
                    Vector<float> r = y + (cr * rCrFactor);
                    Vector<float> g = y - (cb * gCbFactor) - (cr * gCrFactor);
                    Vector<float> b = y + (cb * bCbFactor);

                    // channel = ((maxValue - round(channel)) * k) / maxValue
                    r = (max - r.FastRound()) * k;
                    g = (max - g.FastRound()) * k;
                    b = (max - b.FastRound()) * k;
                    r *= scale;
                    g *= scale;
                    b *= scale;

                    rrRefAsVector = r;
                    ggRefAsVector = g;
                    bbRefAsVector = b;

                    // Collect (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values in the expected (r0,g0,b0,1), (r1,g1,b1,1) ... order:
                    ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
                    destination.Pack(ref rr, ref gg, ref bb);
                }
#endif
            }
        }
    }
}
||||
Loading…
Reference in new issue