Browse Source

Faster Pad3Shuffle4

js/color-alpha-handling
James Jackson-South 6 years ago
parent
commit
d933ed6480
  1. 58
      src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
  2. 2
      src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
  3. 34
      tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs
  4. 4
      tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs

58
src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

@ -101,20 +101,20 @@ namespace SixLabors.ImageSharp
{
if (Ssse3.IsSupported)
{
int remainder = ImageMaths.ModuloP2(source.Length, Vector128<byte>.Count);
int remainder = source.Length % (Vector128<byte>.Count * 3);
int adjustedCount = source.Length - remainder;
int sourceSlice = (int)(adjustedCount * (3 / 4F));
int sourceCount = source.Length - remainder;
int destCount = (int)(sourceCount * (4 / 3D));
if (adjustedCount > 0)
if (sourceCount > 0)
{
Pad3Shuffle4(
source.Slice(0, adjustedCount),
dest.Slice(0, adjustedCount),
source.Slice(0, sourceCount),
dest.Slice(0, destCount),
control);
source = source.Slice(sourceSlice);
dest = dest.Slice(adjustedCount);
source = source.Slice(sourceCount);
dest = dest.Slice(destCount);
}
}
}
@ -320,31 +320,37 @@ namespace SixLabors.ImageSharp
{
if (Ssse3.IsSupported)
{
Vector128<byte> wMask = Vector128.Create(0xff000000u).AsByte();
Vector128<byte> padMask = Vector128.Create(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1).AsByte();
Vector128<byte> vmask = Vector128.Create(0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80).AsByte();
Vector128<byte> vfill = Vector128.Create(0xff000000ff000000ul).AsByte();
Span<byte> bytes = stackalloc byte[Vector128<byte>.Count];
Shuffle.MmShuffleSpan(ref bytes, control);
Vector128<byte> vcm = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(bytes));
Vector128<byte> vshuffle = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(bytes));
fixed (byte* sBase = source)
fixed (byte* dBase = dest)
ref Vector128<byte> sourceBase =
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));
ref Vector128<byte> destBase =
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));
int n = source.Length / Vector128<byte>.Count;
for (int i = 0, j = 0; i < n; i += 3, j += 4)
{
byte* s = sBase;
byte* d = dBase;
ref Vector128<byte> v0 = ref Unsafe.Add(ref sourceBase, i);
Vector128<byte> v1 = Unsafe.Add(ref v0, 1);
Vector128<byte> v2 = Unsafe.Add(ref v0, 2);
Vector128<byte> v3 = Sse2.ShiftRightLogical128BitLane(v2, 4);
// TODO: Consider unrolling and shuffling 4 at a time using Ssse3.AlignRight
// See https://stackoverflow.com/questions/2973708/fast-24-bit-array-32-bit-array-conversion
for (int i = 0; i < source.Length; i += 16)
{
Vector128<byte> vs0 = Sse2.LoadVector128(s);
Vector128<byte> val = Sse2.Or(wMask, Ssse3.Shuffle(vs0, padMask));
val = Ssse3.Shuffle(val, vcm);
Sse2.Store(d, val);
v2 = Ssse3.AlignRight(v2, v1, 8);
v1 = Ssse3.AlignRight(v1, v0, 12);
s += 12;
d += 16;
}
ref Vector128<byte> vd = ref Unsafe.Add(ref destBase, j);
vd = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v0, vmask), vfill), vshuffle);
Unsafe.Add(ref vd, 1) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v1, vmask), vfill), vshuffle);
Unsafe.Add(ref vd, 2) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v2, vmask), vfill), vshuffle);
Unsafe.Add(ref vd, 3) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v3, vmask), vfill), vshuffle);
}
}
}

2
src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs

@ -183,7 +183,7 @@ namespace SixLabors.ImageSharp
"Output span must be divisable by 4!");
DebugGuard.IsTrue(
source.Length == (int)(dest.Length * 3 / 4F),
source.Length == (int)(dest.Length * 3 / 4D),
nameof(source),
"Input span must be 3/4 the length of the output span!");
}

34
tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs

@ -44,21 +44,21 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
//
// Runtime=.NET Core 3.1
//
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
// |------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 62.91 ns | 1.240 ns | 1.569 ns | 1.00 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 2. AVX | Empty | 96 | 44.34 ns | 0.371 ns | 0.329 ns | 0.70 | 0.02 | - | - | - | - |
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 44.46 ns | 0.617 ns | 0.515 ns | 0.70 | 0.02 | - | - | - | - |
// | | | | | | | | | | | | | |
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 247.93 ns | 2.640 ns | 2.470 ns | 1.00 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 2. AVX | Empty | 384 | 92.91 ns | 1.204 ns | 1.127 ns | 0.37 | 0.01 | - | - | - | - |
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 91.42 ns | 1.234 ns | 1.094 ns | 0.37 | 0.01 | - | - | - | - |
// | | | | | | | | | | | | | |
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 444.79 ns | 5.094 ns | 4.254 ns | 1.00 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 2. AVX | Empty | 768 | 162.92 ns | 1.046 ns | 0.873 ns | 0.37 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 166.22 ns | 1.728 ns | 1.443 ns | 0.37 | 0.00 | - | - | - | - |
// | | | | | | | | | | | | | |
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 882.51 ns | 6.936 ns | 5.792 ns | 1.00 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 309.72 ns | 3.777 ns | 3.533 ns | 0.35 | 0.01 | - | - | - | - |
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 323.18 ns | 4.079 ns | 3.816 ns | 0.37 | 0.00 | - | - | - | - |
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
// |------------- |------------------- |-------------------------------------------------- |------ |----------:|----------:|---------:|------:|--------:|------:|------:|------:|----------:|
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 71.84 ns | 1.491 ns | 3.146 ns | 1.00 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 2. AVX | Empty | 96 | 23.14 ns | 0.186 ns | 0.165 ns | 0.33 | 0.02 | - | - | - | - |
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 22.94 ns | 0.127 ns | 0.112 ns | 0.33 | 0.02 | - | - | - | - |
// | | | | | | | | | | | | | |
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 230.90 ns | 0.877 ns | 0.777 ns | 1.00 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 2. AVX | Empty | 384 | 39.54 ns | 0.601 ns | 0.533 ns | 0.17 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 39.88 ns | 0.657 ns | 0.548 ns | 0.17 | 0.00 | - | - | - | - |
// | | | | | | | | | | | | | |
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 451.67 ns | 2.932 ns | 2.448 ns | 1.00 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 2. AVX | Empty | 768 | 60.79 ns | 1.200 ns | 1.561 ns | 0.13 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 62.30 ns | 0.469 ns | 0.416 ns | 0.14 | 0.00 | - | - | - | - |
// | | | | | | | | | | | | | |
// | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 896.11 ns | 10.358 ns | 9.689 ns | 1.00 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 109.72 ns | 2.149 ns | 2.300 ns | 0.12 | 0.00 | - | - | - | - |
// | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 108.70 ns | 0.994 ns | 0.881 ns | 0.12 | 0.00 | - | - | - | - |
}

4
tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs

@ -212,7 +212,7 @@ namespace SixLabors.ImageSharp.Tests.Common
byte[] source = new byte[count];
new Random(count).NextBytes(source);
var result = new byte[(int)(count * (4 / 3F))];
var result = new byte[(int)(count * (4 / 3D))];
byte[] expected = new byte[result.Length];
@ -249,7 +249,7 @@ namespace SixLabors.ImageSharp.Tests.Common
byte[] source = new byte[count];
new Random(count).NextBytes(source);
var result = new byte[(int)(count * (3 / 4F))];
var result = new byte[(int)(count * (3 / 4D))];
byte[] expected = new byte[result.Length];

Loading…
Cancel
Save