From 2fcda3cee0d4091678bf4a41bfcfa2b88a444949 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sun, 21 Oct 2018 01:21:36 +0200 Subject: [PATCH] simplify Rgba32.PixelOperations, include benchmark results --- .../PixelFormats/PixelOperations{TPixel}.cs | 60 ++++----- .../PixelFormats/Rgba32.PixelOperations.cs | 123 ++---------------- .../Color/Bulk/PackFromVector4.cs | 41 ++++-- .../Color/Bulk/ToVector4.cs | 32 ++++- 4 files changed, 94 insertions(+), 162 deletions(-) diff --git a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs index 39c442fe0..cbf164a71 100644 --- a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs +++ b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs @@ -29,7 +29,19 @@ namespace SixLabors.ImageSharp.PixelFormats /// The number of pixels to convert. internal virtual void PackFromVector4(ReadOnlySpan sourceVectors, Span destinationColors, int count) { - PackFromVector4Common(sourceVectors, destinationColors, count); + ReadOnlySpan sourceVectors1 = sourceVectors; + Span destinationColors1 = destinationColors; + GuardSpans(sourceVectors1, nameof(sourceVectors1), destinationColors1, nameof(destinationColors1), count); + + ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors1); + ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors1); + + for (int i = 0; i < count; i++) + { + ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i); + ref TPixel dp = ref Unsafe.Add(ref destRef, i); + dp.PackFromVector4(sp); + } } /// @@ -40,7 +52,19 @@ namespace SixLabors.ImageSharp.PixelFormats /// The number of pixels to convert. internal virtual void ToVector4(ReadOnlySpan sourceColors, Span destinationVectors, int count) { - ToVector4Common(sourceColors, destinationVectors, count); + ReadOnlySpan sourceColors1 = sourceColors; + Span destinationVectors1 = destinationVectors; + GuardSpans(sourceColors1, nameof(sourceColors1), destinationVectors1, nameof(destinationVectors1), count); + + ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors1); + ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors1); + + for (int i = 0; i < count; i++) + { + ref TPixel sp = ref Unsafe.Add(ref sourceRef, i); + ref Vector4 dp = ref Unsafe.Add(ref destRef, i); + dp = sp.ToVector4(); + } } /// @@ -106,37 +130,5 @@ namespace SixLabors.ImageSharp.PixelFormats Guard.MustBeSizedAtLeast(source, minLength, sourceParamName); Guard.MustBeSizedAtLeast(destination, minLength, destinationParamName); } - - [MethodImpl(InliningOptions.ShortMethod)] - internal static void PackFromVector4Common(ReadOnlySpan sourceVectors, Span destinationColors, int count) - { - GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); - - ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors); - ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors); - - for (int i = 0; i < count; i++) - { - ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i); - ref TPixel dp = ref Unsafe.Add(ref destRef, i); - dp.PackFromVector4(sp); - } - } - - [MethodImpl(InliningOptions.ShortMethod)] - internal static void ToVector4Common(ReadOnlySpan sourceColors, Span destinationVectors, int count) - { - GuardSpans(sourceColors, nameof(sourceColors), destinationVectors, nameof(destinationVectors), count); - - ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors); - ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors); - - for (int i = 0; i < count; i++) - { - ref TPixel sp = ref Unsafe.Add(ref sourceRef, i); - ref Vector4 dp = ref Unsafe.Add(ref destRef, i); - dp = sp.ToVector4(); - } - } } } \ No newline at end of file diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs index 564b93ef5..bb42ec7e3 100644 --- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs @@ -24,21 +24,12 @@ namespace SixLabors.ImageSharp.PixelFormats Guard.MustBeSizedAtLeast(sourceColors, count, nameof(sourceColors)); Guard.MustBeSizedAtLeast(destinationVectors, count, nameof(destinationVectors)); - if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) - { - // Doesn't worth to bother with SIMD: - ToVector4Common(sourceColors, destinationVectors, count); - return; - } + sourceColors = sourceColors.Slice(0, count); + destinationVectors = destinationVectors.Slice(0, count); - if (SimdUtils.ExtendedIntrinsics.IsAvailable) - { - ConvertToVector4UsingExtendedIntrinsics(sourceColors, destinationVectors, count); - } - else - { - ConvertToVector4UsingBasicIntrinsics(sourceColors, destinationVectors, count); - } + SimdUtils.BulkConvertByteToNormalizedFloat( + MemoryMarshal.Cast(sourceColors), + MemoryMarshal.Cast(destinationVectors)); } /// @@ -46,20 +37,12 @@ namespace SixLabors.ImageSharp.PixelFormats { GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); - if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) - { - PackFromVector4Common(sourceVectors, destinationColors, count); - return; - } + sourceVectors = sourceVectors.Slice(0, count); + destinationColors = destinationColors.Slice(0, count); - if (SimdUtils.ExtendedIntrinsics.IsAvailable) - { - ConvertFromVector4ExtendedIntrinsics(sourceVectors, destinationColors, count); - } - else - { - ConvertFromVector4BasicIntrinsics(sourceVectors, destinationColors, count); - } + SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows( + MemoryMarshal.Cast(sourceVectors), + MemoryMarshal.Cast(destinationColors)); } /// @@ -89,92 +72,6 @@ namespace SixLabors.ImageSharp.PixelFormats sourcePixels.Slice(0, count).CopyTo(dest); } - - private static void ConvertToVector4UsingExtendedIntrinsics( - ReadOnlySpan sourceColors, - Span destinationVectors, - int count) - { - int remainder = count % 8; - int alignedCount = count - remainder; - - if (alignedCount > 0) - { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); - Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); - - SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); - } - - if (remainder > 0) - { - ToVector4Common(sourceColors.Slice(alignedCount), destinationVectors.Slice(alignedCount), remainder); - } - } - - private static void ConvertToVector4UsingBasicIntrinsics( - ReadOnlySpan sourceColors, - Span destinationVectors, - int count) - { - int remainder = count % 2; - int alignedCount = count - remainder; - - if (alignedCount > 0) - { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); - Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); - - SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); - } - - if (remainder > 0) - { - // actually: remainder == 1 - int lastIdx = count - 1; - destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4(); - } - } - - private static void ConvertFromVector4ExtendedIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count) - { - int remainder = count % 8; - int alignedCount = count - remainder; - - if (alignedCount > 0) - { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors); - Span rawDest = MemoryMarshal.Cast(destinationColors.Slice(0, alignedCount)); - - SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); - } - - if (remainder > 0) - { - PackFromVector4Common(sourceVectors.Slice(alignedCount), destinationColors.Slice(alignedCount), remainder); - } - } - - private static void ConvertFromVector4BasicIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count) - { - int remainder = count % 2; - int alignedCount = count - remainder; - - if (alignedCount > 0) - { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount)); - Span rawDest = MemoryMarshal.Cast(destinationColors); - - SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); - } - - if (remainder > 0) - { - // actually: remainder == 1 - int lastIdx = count - 1; - destinationColors[lastIdx].PackFromVector4(sourceVectors[lastIdx]); - } - } } } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index eb7154955..7a212b052 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -25,7 +25,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk protected IMemoryOwner destination; [Params( - //64, + 64, 2048 )] public int Count { get; set; } @@ -72,7 +72,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public class PackFromVector4_Rgba32 : PackFromVector4 { [Benchmark] - public void FastDefault() + public void BasicBulk() { ref Vector4 sBase = ref this.source.GetSpan()[0]; ref Rgba32 dBase = ref this.destination.GetSpan()[0]; @@ -112,16 +112,31 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } - // RESULTS: - // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Allocated | - // ----------------------------------------------------------------- |-------- |------ |----------:|----------:|----------:|-------:|---------:|----------:| - // FastDefault | Clr | 2048 | 15.989 us | 6.1384 us | 0.3468 us | 4.07 | 0.08 | 0 B | - // BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3.931 us | 0.6264 us | 0.0354 us | 1.00 | 0.00 | 0 B | - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 2.100 us | 0.4717 us | 0.0267 us | 0.53 | 0.01 | 0 B | - // - // | | | | | | | | | - // FastDefault | Core | 2048 | 14.693 us | 0.5131 us | 0.0290 us | 3.76 | 0.03 | 0 B | - // BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 3.913 us | 0.5661 us | 0.0320 us | 1.00 | 0.00 | 0 B | - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 1.966 us | 0.4056 us | 0.0229 us | 0.50 | 0.01 | 0 B | + // RESULTS (2018 October): + // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated | + // ------------------------------------------------------------------ |-------- |------ |-------------:|-------------:|-----------:|-------:|---------:|-------:|----------:| + // BasicBulk | Clr | 64 | 581.62 ns | 33.625 ns | 1.8999 ns | 2.27 | 0.02 | - | 0 B | + // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 64 | 256.66 ns | 45.153 ns | 2.5512 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 64 | 201.92 ns | 30.161 ns | 1.7042 ns | 0.79 | 0.01 | - | 0 B | + // PixelOperations_Base | Clr | 64 | 665.01 ns | 13.032 ns | 0.7363 ns | 2.59 | 0.02 | 0.0067 | 24 B | + // PixelOperations_Specialized | Clr | 64 | 295.14 ns | 26.335 ns | 1.4880 ns | 1.15 | 0.01 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Core | 64 | 513.22 ns | 91.110 ns | 5.1479 ns | 3.19 | 0.03 | - | 0 B | + // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Core | 64 | 160.76 ns | 2.760 ns | 0.1559 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 64 | 95.98 ns | 10.077 ns | 0.5694 ns | 0.60 | 0.00 | - | 0 B | + // PixelOperations_Base | Core | 64 | 591.74 ns | 49.856 ns | 2.8170 ns | 3.68 | 0.01 | 0.0067 | 24 B | + // PixelOperations_Specialized | Core | 64 | 149.11 ns | 4.485 ns | 0.2534 ns | 0.93 | 0.00 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Clr | 2048 | 15,345.85 ns | 1,213.551 ns | 68.5679 ns | 3.90 | 0.01 | - | 0 B | + // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3,939.49 ns | 71.101 ns | 4.0173 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 2,272.61 ns | 110.671 ns | 6.2531 ns | 0.58 | 0.00 | - | 0 B | + // PixelOperations_Base | Clr | 2048 | 17,422.47 ns | 811.733 ns | 45.8644 ns | 4.42 | 0.01 | - | 24 B | + // PixelOperations_Specialized | Clr | 2048 | 3,984.26 ns | 110.352 ns | 6.2351 ns | 1.01 | 0.00 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Core | 2048 | 14,950.43 ns | 699.309 ns | 39.5123 ns | 3.76 | 0.02 | - | 0 B | + // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 3,978.28 ns | 481.105 ns | 27.1833 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 2,169.54 ns | 75.606 ns | 4.2719 ns | !!0.55!| 0.00 | - | 0 B | + // PixelOperations_Base | Core | 2048 | 18,403.62 ns | 1,494.056 ns | 84.4169 ns | 4.63 | 0.03 | - | 24 B | + // PixelOperations_Specialized | Core | 2048 | 2,227.60 ns | 486.761 ns | 27.5029 ns | !!0.56!| 0.01 | - | 0 B | } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index c50c7ce5a..4a801d64e 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -28,7 +28,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk protected IMemoryOwner destination; [Params( - //64, + 64, //256, //512, //1024, @@ -160,7 +160,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - [Benchmark] + //[Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -201,5 +201,33 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk v *= scale; return v; } + + // RESULTS (2018 October): + // + // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated | + // ---------------------------------------------------- |-------- |------ |------------:|-------------:|-----------:|-------:|---------:|-------:|----------:| + // BasicBulk | Clr | 64 | 267.40 ns | 30.711 ns | 1.7352 ns | 1.07 | 0.01 | - | 0 B | + // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Clr | 64 | 249.97 ns | 33.838 ns | 1.9119 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Clr | 64 | 176.97 ns | 5.221 ns | 0.2950 ns | 0.71 | 0.00 | - | 0 B | + // PixelOperations_Base | Clr | 64 | 349.70 ns | 104.331 ns | 5.8949 ns | 1.40 | 0.02 | 0.0072 | 24 B | + // PixelOperations_Specialized | Clr | 64 | 288.31 ns | 26.833 ns | 1.5161 ns | 1.15 | 0.01 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Core | 64 | 185.36 ns | 30.051 ns | 1.6979 ns | 1.26 | 0.01 | - | 0 B | + // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Core | 64 | 146.84 ns | 12.674 ns | 0.7161 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Core | 64 | 67.31 ns | 2.542 ns | 0.1436 ns | 0.46 | 0.00 | - | 0 B | + // PixelOperations_Base | Core | 64 | 272.03 ns | 94.419 ns | 5.3348 ns | 1.85 | 0.03 | 0.0072 | 24 B | + // PixelOperations_Specialized | Core | 64 | 121.91 ns | 31.477 ns | 1.7785 ns | 0.83 | 0.01 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Clr | 2048 | 5,133.04 ns | 284.052 ns | 16.0494 ns | 1.21 | 0.01 | - | 0 B | + // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Clr | 2048 | 4,248.58 ns | 1,095.887 ns | 61.9196 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Clr | 2048 | 1,214.02 ns | 184.349 ns | 10.4160 ns | 0.29 | 0.00 | - | 0 B | + // PixelOperations_Base | Clr | 2048 | 7,096.04 ns | 362.350 ns | 20.4734 ns | 1.67 | 0.02 | - | 24 B | + // PixelOperations_Specialized | Clr | 2048 | 4,314.19 ns | 204.964 ns | 11.5809 ns | 1.02 | 0.01 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Core | 2048 | 5,038.38 ns | 223.282 ns | 12.6158 ns | 1.20 | 0.01 | - | 0 B | + // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Core | 2048 | 4,199.17 ns | 897.985 ns | 50.7378 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Core | 2048 | 1,113.86 ns | 64.799 ns | 3.6613 ns | !!0.27!| 0.00 | - | 0 B | + // PixelOperations_Base | Core | 2048 | 7,015.00 ns | 920.083 ns | 51.9864 ns | 1.67 | 0.02 | - | 24 B | + // PixelOperations_Specialized | Core | 2048 | 1,176.59 ns | 256.955 ns | 14.5184 ns | !!0.28!| 0.00 | - | 0 B | } } \ No newline at end of file