Browse Source

simplify Rgba32.PixelOperations, include benchmark results

pull/742/head
Anton Firszov 7 years ago
parent
commit
2fcda3cee0
  1. 60
      src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs
  2. 123
      src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
  3. 41
      tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
  4. 32
      tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs

60
src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs

@ -29,7 +29,19 @@ namespace SixLabors.ImageSharp.PixelFormats
/// <param name="count">The number of pixels to convert.</param>
internal virtual void PackFromVector4(ReadOnlySpan<Vector4> sourceVectors, Span<TPixel> destinationColors, int count)
{
    // Guard on the real parameter names, so a failure names an argument the caller
    // actually passed instead of a method-local alias.
    GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count);

    // Raw references to the first element of each span; the loop below addresses
    // elements relative to them via Unsafe.Add.
    ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors);
    ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors);

    // Single pass: pack the vector at index i into the pixel at index i.
    for (int i = 0; i < count; i++)
    {
        ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i);
        ref TPixel dp = ref Unsafe.Add(ref destRef, i);
        dp.PackFromVector4(sp);
    }
}
/// <summary>
@ -40,7 +52,19 @@ namespace SixLabors.ImageSharp.PixelFormats
/// <param name="count">The number of pixels to convert.</param>
internal virtual void ToVector4(ReadOnlySpan<TPixel> sourceColors, Span<Vector4> destinationVectors, int count)
{
    // Guard on the real parameter names, so a failure names an argument the caller
    // actually passed instead of a method-local alias.
    GuardSpans(sourceColors, nameof(sourceColors), destinationVectors, nameof(destinationVectors), count);

    // Raw references to the first element of each span; the loop below addresses
    // elements relative to them via Unsafe.Add.
    ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors);
    ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors);

    // Single pass: expand the pixel at index i into the vector at index i.
    for (int i = 0; i < count; i++)
    {
        ref TPixel sp = ref Unsafe.Add(ref sourceRef, i);
        ref Vector4 dp = ref Unsafe.Add(ref destRef, i);
        dp = sp.ToVector4();
    }
}
/// <summary>
@ -106,37 +130,5 @@ namespace SixLabors.ImageSharp.PixelFormats
Guard.MustBeSizedAtLeast(source, minLength, sourceParamName);
Guard.MustBeSizedAtLeast(destination, minLength, destinationParamName);
}
[MethodImpl(InliningOptions.ShortMethod)]
internal static void PackFromVector4Common(ReadOnlySpan<Vector4> sourceVectors, Span<TPixel> destinationColors, int count)
{
    // Validate both spans against 'count' before any raw reference is taken.
    GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count);

    ref Vector4 firstVector = ref MemoryMarshal.GetReference(sourceVectors);
    ref TPixel firstPixel = ref MemoryMarshal.GetReference(destinationColors);

    // Pack the vector at each index into the pixel at the same index.
    int index = 0;
    while (index < count)
    {
        Unsafe.Add(ref firstPixel, index).PackFromVector4(Unsafe.Add(ref firstVector, index));
        index++;
    }
}
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ToVector4Common(ReadOnlySpan<TPixel> sourceColors, Span<Vector4> destinationVectors, int count)
{
    // Validate both spans against 'count' before any raw reference is taken.
    GuardSpans(sourceColors, nameof(sourceColors), destinationVectors, nameof(destinationVectors), count);

    ref TPixel firstPixel = ref MemoryMarshal.GetReference(sourceColors);
    ref Vector4 firstVector = ref MemoryMarshal.GetReference(destinationVectors);

    // Expand the pixel at each index into the vector at the same index.
    int index = 0;
    while (index < count)
    {
        Unsafe.Add(ref firstVector, index) = Unsafe.Add(ref firstPixel, index).ToVector4();
        index++;
    }
}
}
}

123
src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs

@ -24,21 +24,12 @@ namespace SixLabors.ImageSharp.PixelFormats
Guard.MustBeSizedAtLeast(sourceColors, count, nameof(sourceColors));
Guard.MustBeSizedAtLeast(destinationVectors, count, nameof(destinationVectors));
if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture)
{
// Not worth bothering with SIMD:
ToVector4Common(sourceColors, destinationVectors, count);
return;
}
sourceColors = sourceColors.Slice(0, count);
destinationVectors = destinationVectors.Slice(0, count);
if (SimdUtils.ExtendedIntrinsics.IsAvailable)
{
ConvertToVector4UsingExtendedIntrinsics(sourceColors, destinationVectors, count);
}
else
{
ConvertToVector4UsingBasicIntrinsics(sourceColors, destinationVectors, count);
}
SimdUtils.BulkConvertByteToNormalizedFloat(
MemoryMarshal.Cast<Rgba32, byte>(sourceColors),
MemoryMarshal.Cast<Vector4, float>(destinationVectors));
}
/// <inheritdoc />
@ -46,20 +37,12 @@ namespace SixLabors.ImageSharp.PixelFormats
{
GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count);
if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture)
{
PackFromVector4Common(sourceVectors, destinationColors, count);
return;
}
sourceVectors = sourceVectors.Slice(0, count);
destinationColors = destinationColors.Slice(0, count);
if (SimdUtils.ExtendedIntrinsics.IsAvailable)
{
ConvertFromVector4ExtendedIntrinsics(sourceVectors, destinationColors, count);
}
else
{
ConvertFromVector4BasicIntrinsics(sourceVectors, destinationColors, count);
}
SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(
MemoryMarshal.Cast<Vector4, float>(sourceVectors),
MemoryMarshal.Cast<Rgba32, byte>(destinationColors));
}
/// <inheritdoc />
@ -89,92 +72,6 @@ namespace SixLabors.ImageSharp.PixelFormats
sourcePixels.Slice(0, count).CopyTo(dest);
}
/// <summary>
/// Converts <paramref name="count"/> pixels to vectors. The longest prefix whose length is a
/// multiple of 8 goes through <c>SimdUtils.ExtendedIntrinsics</c>; any leftover pixels go through
/// <c>ToVector4Common</c>.
/// </summary>
/// <param name="sourceColors">The pixels to convert.</param>
/// <param name="destinationVectors">The span receiving the converted vectors.</param>
/// <param name="count">The number of pixels to convert.</param>
private static void ConvertToVector4UsingExtendedIntrinsics(
    ReadOnlySpan<Rgba32> sourceColors,
    Span<Vector4> destinationVectors,
    int count)
{
    int remainder = count % 8;
    int alignedCount = count - remainder;

    if (alignedCount > 0)
    {
        // Slice BOTH spans to the aligned prefix so the bulk routine receives source and
        // destination buffers covering exactly the same pixels, whichever length it iterates on.
        ReadOnlySpan<byte> rawSrc = MemoryMarshal.Cast<Rgba32, byte>(sourceColors.Slice(0, alignedCount));
        Span<float> rawDest = MemoryMarshal.Cast<Vector4, float>(destinationVectors.Slice(0, alignedCount));
        SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
    }

    if (remainder > 0)
    {
        // Tail pixels that do not fill a whole block of 8.
        ToVector4Common(sourceColors.Slice(alignedCount), destinationVectors.Slice(alignedCount), remainder);
    }
}
/// <summary>
/// Converts <paramref name="count"/> pixels to vectors. The longest even-length prefix goes through
/// <c>SimdUtils.BasicIntrinsics256</c>; a single leftover pixel, if any, is converted individually.
/// </summary>
/// <param name="sourceColors">The pixels to convert.</param>
/// <param name="destinationVectors">The span receiving the converted vectors.</param>
/// <param name="count">The number of pixels to convert.</param>
private static void ConvertToVector4UsingBasicIntrinsics(
    ReadOnlySpan<Rgba32> sourceColors,
    Span<Vector4> destinationVectors,
    int count)
{
    int remainder = count % 2;
    int alignedCount = count - remainder;

    if (alignedCount > 0)
    {
        // Slice BOTH spans to the aligned prefix so the bulk routine receives source and
        // destination buffers covering exactly the same pixels, whichever length it iterates on.
        ReadOnlySpan<byte> rawSrc = MemoryMarshal.Cast<Rgba32, byte>(sourceColors.Slice(0, alignedCount));
        Span<float> rawDest = MemoryMarshal.Cast<Vector4, float>(destinationVectors.Slice(0, alignedCount));
        SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
    }

    if (remainder > 0)
    {
        // count % 2 can only leave one pixel over: the last one.
        int lastIdx = count - 1;
        destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4();
    }
}
/// <summary>
/// Packs <paramref name="count"/> vectors into pixels. The longest prefix whose length is a
/// multiple of 8 goes through <c>SimdUtils.ExtendedIntrinsics</c>; any leftover vectors go through
/// <c>PackFromVector4Common</c>.
/// </summary>
/// <param name="sourceVectors">The vectors to pack.</param>
/// <param name="destinationColors">The span receiving the packed pixels.</param>
/// <param name="count">The number of vectors to pack.</param>
private static void ConvertFromVector4ExtendedIntrinsics(ReadOnlySpan<Vector4> sourceVectors, Span<Rgba32> destinationColors, int count)
{
    int remainder = count % 8;
    int alignedCount = count - remainder;

    if (alignedCount > 0)
    {
        // Slice BOTH spans to the aligned prefix so the bulk routine receives source and
        // destination buffers covering exactly the same pixels, whichever length it iterates on.
        ReadOnlySpan<float> rawSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
        Span<byte> rawDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors.Slice(0, alignedCount));
        SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
    }

    if (remainder > 0)
    {
        // Tail vectors that do not fill a whole block of 8.
        PackFromVector4Common(sourceVectors.Slice(alignedCount), destinationColors.Slice(alignedCount), remainder);
    }
}
/// <summary>
/// Packs <paramref name="count"/> vectors into pixels. The longest even-length prefix goes through
/// <c>SimdUtils.BasicIntrinsics256</c>; a single leftover vector, if any, is packed individually.
/// </summary>
/// <param name="sourceVectors">The vectors to pack.</param>
/// <param name="destinationColors">The span receiving the packed pixels.</param>
/// <param name="count">The number of vectors to pack.</param>
private static void ConvertFromVector4BasicIntrinsics(ReadOnlySpan<Vector4> sourceVectors, Span<Rgba32> destinationColors, int count)
{
    int remainder = count % 2;
    int alignedCount = count - remainder;

    if (alignedCount > 0)
    {
        // Slice BOTH spans to the aligned prefix so the bulk routine receives source and
        // destination buffers covering exactly the same pixels, whichever length it iterates on.
        ReadOnlySpan<float> rawSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
        Span<byte> rawDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors.Slice(0, alignedCount));
        SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
    }

    if (remainder > 0)
    {
        // count % 2 can only leave one vector over: the last one.
        int lastIdx = count - 1;
        destinationColors[lastIdx].PackFromVector4(sourceVectors[lastIdx]);
    }
}
}
}
}

41
tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs

@ -25,7 +25,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
protected IMemoryOwner<TPixel> destination;
[Params(
//64,
64,
2048
)]
public int Count { get; set; }
@ -72,7 +72,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
public class PackFromVector4_Rgba32 : PackFromVector4<Rgba32>
{
[Benchmark]
public void FastDefault()
public void BasicBulk()
{
ref Vector4 sBase = ref this.source.GetSpan()[0];
ref Rgba32 dBase = ref this.destination.GetSpan()[0];
@ -112,16 +112,31 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats);
}
// RESULTS:
// Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Allocated |
// ----------------------------------------------------------------- |-------- |------ |----------:|----------:|----------:|-------:|---------:|----------:|
// FastDefault | Clr | 2048 | 15.989 us | 6.1384 us | 0.3468 us | 4.07 | 0.08 | 0 B |
// BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3.931 us | 0.6264 us | 0.0354 us | 1.00 | 0.00 | 0 B |
// ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 2.100 us | 0.4717 us | 0.0267 us | 0.53 | 0.01 | 0 B |
//
// | | | | | | | | |
// FastDefault | Core | 2048 | 14.693 us | 0.5131 us | 0.0290 us | 3.76 | 0.03 | 0 B |
// BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 3.913 us | 0.5661 us | 0.0320 us | 1.00 | 0.00 | 0 B |
// ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 1.966 us | 0.4056 us | 0.0229 us | 0.50 | 0.01 | 0 B |
// RESULTS (2018 October):
// Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated |
// ------------------------------------------------------------------ |-------- |------ |-------------:|-------------:|-----------:|-------:|---------:|-------:|----------:|
// BasicBulk | Clr | 64 | 581.62 ns | 33.625 ns | 1.8999 ns | 2.27 | 0.02 | - | 0 B |
// BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 64 | 256.66 ns | 45.153 ns | 2.5512 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 64 | 201.92 ns | 30.161 ns | 1.7042 ns | 0.79 | 0.01 | - | 0 B |
// PixelOperations_Base | Clr | 64 | 665.01 ns | 13.032 ns | 0.7363 ns | 2.59 | 0.02 | 0.0067 | 24 B |
// PixelOperations_Specialized | Clr | 64 | 295.14 ns | 26.335 ns | 1.4880 ns | 1.15 | 0.01 | - | 0 B |
// | | | | | | | | | |
// BasicBulk | Core | 64 | 513.22 ns | 91.110 ns | 5.1479 ns | 3.19 | 0.03 | - | 0 B |
// BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Core | 64 | 160.76 ns | 2.760 ns | 0.1559 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 64 | 95.98 ns | 10.077 ns | 0.5694 ns | 0.60 | 0.00 | - | 0 B |
// PixelOperations_Base | Core | 64 | 591.74 ns | 49.856 ns | 2.8170 ns | 3.68 | 0.01 | 0.0067 | 24 B |
// PixelOperations_Specialized | Core | 64 | 149.11 ns | 4.485 ns | 0.2534 ns | 0.93 | 0.00 | - | 0 B |
// | | | | | | | | | |
// BasicBulk | Clr | 2048 | 15,345.85 ns | 1,213.551 ns | 68.5679 ns | 3.90 | 0.01 | - | 0 B |
// BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3,939.49 ns | 71.101 ns | 4.0173 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 2,272.61 ns | 110.671 ns | 6.2531 ns | 0.58 | 0.00 | - | 0 B |
// PixelOperations_Base | Clr | 2048 | 17,422.47 ns | 811.733 ns | 45.8644 ns | 4.42 | 0.01 | - | 24 B |
// PixelOperations_Specialized | Clr | 2048 | 3,984.26 ns | 110.352 ns | 6.2351 ns | 1.01 | 0.00 | - | 0 B |
// | | | | | | | | | |
// BasicBulk | Core | 2048 | 14,950.43 ns | 699.309 ns | 39.5123 ns | 3.76 | 0.02 | - | 0 B |
// BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 3,978.28 ns | 481.105 ns | 27.1833 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 2,169.54 ns | 75.606 ns | 4.2719 ns | !!0.55!| 0.00 | - | 0 B |
// PixelOperations_Base | Core | 2048 | 18,403.62 ns | 1,494.056 ns | 84.4169 ns | 4.63 | 0.03 | - | 24 B |
// PixelOperations_Specialized | Core | 2048 | 2,227.60 ns | 486.761 ns | 27.5029 ns | !!0.56!| 0.01 | - | 0 B |
}
}

32
tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs

@ -28,7 +28,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
protected IMemoryOwner<Vector4> destination;
[Params(
//64,
64,
//256,
//512,
//1024,
@ -160,7 +160,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
}
}
[Benchmark]
//[Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
@ -201,5 +201,33 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
v *= scale;
return v;
}
// RESULTS (2018 October):
//
// Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated |
// ---------------------------------------------------- |-------- |------ |------------:|-------------:|-----------:|-------:|---------:|-------:|----------:|
// BasicBulk | Clr | 64 | 267.40 ns | 30.711 ns | 1.7352 ns | 1.07 | 0.01 | - | 0 B |
// BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Clr | 64 | 249.97 ns | 33.838 ns | 1.9119 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Clr | 64 | 176.97 ns | 5.221 ns | 0.2950 ns | 0.71 | 0.00 | - | 0 B |
// PixelOperations_Base | Clr | 64 | 349.70 ns | 104.331 ns | 5.8949 ns | 1.40 | 0.02 | 0.0072 | 24 B |
// PixelOperations_Specialized | Clr | 64 | 288.31 ns | 26.833 ns | 1.5161 ns | 1.15 | 0.01 | - | 0 B |
// | | | | | | | | | |
// BasicBulk | Core | 64 | 185.36 ns | 30.051 ns | 1.6979 ns | 1.26 | 0.01 | - | 0 B |
// BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Core | 64 | 146.84 ns | 12.674 ns | 0.7161 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Core | 64 | 67.31 ns | 2.542 ns | 0.1436 ns | 0.46 | 0.00 | - | 0 B |
// PixelOperations_Base | Core | 64 | 272.03 ns | 94.419 ns | 5.3348 ns | 1.85 | 0.03 | 0.0072 | 24 B |
// PixelOperations_Specialized | Core | 64 | 121.91 ns | 31.477 ns | 1.7785 ns | 0.83 | 0.01 | - | 0 B |
// | | | | | | | | | |
// BasicBulk | Clr | 2048 | 5,133.04 ns | 284.052 ns | 16.0494 ns | 1.21 | 0.01 | - | 0 B |
// BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Clr | 2048 | 4,248.58 ns | 1,095.887 ns | 61.9196 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Clr | 2048 | 1,214.02 ns | 184.349 ns | 10.4160 ns | 0.29 | 0.00 | - | 0 B |
// PixelOperations_Base | Clr | 2048 | 7,096.04 ns | 362.350 ns | 20.4734 ns | 1.67 | 0.02 | - | 24 B |
// PixelOperations_Specialized | Clr | 2048 | 4,314.19 ns | 204.964 ns | 11.5809 ns | 1.02 | 0.01 | - | 0 B |
// | | | | | | | | | |
// BasicBulk | Core | 2048 | 5,038.38 ns | 223.282 ns | 12.6158 ns | 1.20 | 0.01 | - | 0 B |
// BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Core | 2048 | 4,199.17 ns | 897.985 ns | 50.7378 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Core | 2048 | 1,113.86 ns | 64.799 ns | 3.6613 ns | !!0.27!| 0.00 | - | 0 B |
// PixelOperations_Base | Core | 2048 | 7,015.00 ns | 920.083 ns | 51.9864 ns | 1.67 | 0.02 | - | 24 B |
// PixelOperations_Specialized | Core | 2048 | 1,176.59 ns | 256.955 ns | 14.5184 ns | !!0.28!| 0.00 | - | 0 B |
}
}
Loading…
Cancel
Save