From 260a8f8c9a3d0730e836a606e10e5802d5b0fe1e Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sun, 14 Oct 2018 22:43:02 +0200 Subject: [PATCH 01/22] BulkConvertByteToNormalizedFloat --- src/ImageSharp/Common/Extensions/SimdUtils.cs | 75 +- .../PixelFormats/Rgba32.PixelOperations.cs | 12 +- .../Color/Bulk/ToVector4.cs | 31 +- .../PixelFormats/PixelOperationsTests.cs | 1384 +++++++++-------- 4 files changed, 802 insertions(+), 700 deletions(-) diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.cs b/src/ImageSharp/Common/Extensions/SimdUtils.cs index 7b77fefca..db1e80dda 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.cs +++ b/src/ImageSharp/Common/Extensions/SimdUtils.cs @@ -7,6 +7,8 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using SixLabors.ImageSharp.PixelFormats; + namespace SixLabors.ImageSharp { /// @@ -103,6 +105,47 @@ namespace SixLabors.ImageSharp } } + internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) + { + if (!Vector.IsHardwareAccelerated) + { + throw new InvalidOperationException( + "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!"); + } + + DebugGuard.IsTrue((dest.Length % Vector.Count) == 0, nameof(source), "dest.Length should be divisable by Vector.Count!"); + + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f + var mask = new Vector(255); + + ref Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + + ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); + + int n = dest.Length / 8; + Octet.OfUInt32 temp = default; + + for (int i = 0; i < n; i++) + { + Octet.OfByte sVal = Unsafe.Add(ref sourceBase, i); + + // This call is the bottleneck now: + temp.LoadFrom(ref sVal); + + Vector vi = Unsafe.As>(ref temp); + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; + + Unsafe.Add(ref destBaseAsFloat, i) = vf; + } + } + /// /// Same as but clamps overflown values before conversion. /// @@ -181,6 +224,19 @@ namespace SixLabors.ImageSharp { return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; } + + [MethodImpl(InliningOptions.ShortMethod)] + public void LoadFrom(ref OfByte src) + { + this.V0 = src.V0; + this.V1 = src.V1; + this.V2 = src.V2; + this.V3 = src.V3; + this.V4 = src.V4; + this.V5 = src.V5; + this.V6 = src.V6; + this.V7 = src.V7; + } } [StructLayout(LayoutKind.Explicit, Size = 8)] @@ -215,16 +271,17 @@ namespace SixLabors.ImageSharp return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; } - public void LoadFrom(ref OfUInt32 i) + [MethodImpl(InliningOptions.ShortMethod)] + public void LoadFrom(ref OfUInt32 src) { - this.V0 = (byte)i.V0; - this.V1 = (byte)i.V1; - this.V2 = (byte)i.V2; - this.V3 = (byte)i.V3; - this.V4 = (byte)i.V4; - this.V5 = (byte)i.V5; - this.V6 = (byte)i.V6; - this.V7 = (byte)i.V7; + this.V0 = (byte)src.V0; + this.V1 = (byte)src.V1; + this.V2 = (byte)src.V2; + this.V3 = (byte)src.V3; + this.V4 = (byte)src.V4; + this.V5 = (byte)src.V5; + this.V6 = (byte)src.V6; + this.V7 = (byte)src.V7; } } } diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs index 2629ce3f7..76e119ba4 100644 --- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs @@ -57,14 +57,14 @@ namespace SixLabors.ImageSharp.PixelFormats int unpackedRawCount = count * 4; ref uint sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(sourceColors)); - ref UnpackedRGBA destBaseAsUnpacked = ref Unsafe.As(ref MemoryMarshal.GetReference(destVectors)); - ref Vector destBaseAsUInt = ref Unsafe.As>(ref destBaseAsUnpacked); - ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsUnpacked); + ref WideRgba destBaseAsWide = ref Unsafe.As(ref MemoryMarshal.GetReference(destVectors)); + ref Vector destBaseAsUInt = ref Unsafe.As>(ref destBaseAsWide); + ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWide); for (int i = 0; i < count; i++) { uint sVal = Unsafe.Add(ref sourceBase, i); - ref UnpackedRGBA dst = ref Unsafe.Add(ref destBaseAsUnpacked, i); + ref WideRgba dst = ref Unsafe.Add(ref destBaseAsWide, i); // This call is the bottleneck now: dst.Load(sVal); @@ -174,10 +174,10 @@ namespace SixLabors.ImageSharp.PixelFormats } /// - /// Value type to store -s unpacked into multiple -s. + /// Value type to store -s widened into multiple -s. /// [StructLayout(LayoutKind.Sequential)] - private struct UnpackedRGBA + private struct WideRgba { private uint r; diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 50fac2513..7b6f902d8 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -6,8 +6,13 @@ using System.Buffers; using System; using System.Numerics; +using System.Runtime.InteropServices; using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Attributes.Jobs; +using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Environments; +using BenchmarkDotNet.Jobs; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -17,11 +22,13 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public abstract class ToVector4 where TPixel : struct, IPixel { - private IMemoryOwner source; + protected IMemoryOwner source; - private IMemoryOwner destination; + protected IMemoryOwner destination; - [Params(64, 300, 1024)] + [Params( + //64, + 1024)] public int Count { get; set; } [GlobalSetup] @@ -38,7 +45,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk this.destination.Dispose(); } - [Benchmark(Baseline = true)] + [Benchmark] public void PerElement() { Span s = this.source.GetSpan(); @@ -51,7 +58,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - [Benchmark] + [Benchmark(Baseline = true)] public void CommonBulk() { new PixelOperations().ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); @@ -64,7 +71,21 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } + [CoreJob] + //[ClrJob] public class ToVector4_Rgba32 : ToVector4 { + class Config : ManualConfig + { + } + + [Benchmark] + public void BulkConvertByteToNormalizedFloat() + { + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + + SimdUtils.BulkConvertByteToNormalizedFloat(sBytes, dFloats); + } } } \ No newline at end of file diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs index 9e41fd94f..a96da03e7 100644 --- a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs +++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs @@ -1,93 +1,117 @@ -// Copyright (c) Six Labors and contributors. -// Licensed under the Apache License, Version 2.0. - -using System; -using System.Buffers; -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using SixLabors.ImageSharp.Memory; -using SixLabors.ImageSharp.PixelFormats; -using Xunit; -using Xunit.Abstractions; - -namespace SixLabors.ImageSharp.Tests.PixelFormats -{ - public partial class PixelOperationsTests - { - public class Rgba32 : PixelOperationsTests - { - public Rgba32(ITestOutputHelper output) - : base(output) - { - } - - // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class: - public static new TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; - - [Fact] - public void IsSpecialImplementation() - { - Assert.IsType(PixelOperations.Instance); - } - - [Fact] - public void ToVector4SimdAligned() - { - if (!Vector.IsHardwareAccelerated) - { - return; - } - - ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(64); - Vector4[] expected = CreateExpectedVector4Data(source); - - TestOperation( - source, - expected, - (s, d) => ImageSharp.PixelFormats.Rgba32.PixelOperations.ToVector4SimdAligned(s, d.GetSpan(), 64) - ); - } - - - // [Fact] // Profiling benchmark - enable manually! -#pragma warning disable xUnit1013 // Public method should be marked as test - public void Benchmark_ToVector4() -#pragma warning restore xUnit1013 // Public method should be marked as test - { - int times = 200000; - int count = 1024; - - using (IMemoryOwner source = Configuration.Default.MemoryAllocator.Allocate(count)) - using (IMemoryOwner dest = Configuration.Default.MemoryAllocator.Allocate(count)) - { - this.Measure( - times, - () => - { - PixelOperations.Instance.ToVector4(source.GetSpan(), dest.GetSpan(), count); - }); - } - } - } - - public class Argb32 : PixelOperationsTests - { - // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class: - public Argb32(ITestOutputHelper output) - : base(output) - { - } - - public static new TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; - } - - [Theory] - [WithBlankImages(1, 1, PixelTypes.All)] - public void GetGlobalInstance(TestImageProvider dummy) - where TPixel : struct, IPixel - { - Assert.NotNull(PixelOperations.Instance); +// Copyright (c) Six Labors and contributors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Buffers; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using SixLabors.ImageSharp.Memory; +using SixLabors.ImageSharp.PixelFormats; +using Xunit; +using Xunit.Abstractions; + +namespace SixLabors.ImageSharp.Tests.PixelFormats +{ + public partial class PixelOperationsTests + { + public class Rgba32 : PixelOperationsTests + { + public Rgba32(ITestOutputHelper output) + : base(output) + { + } + + // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class: + public static new TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; + + [Fact] + public void IsSpecialImplementation() + { + Assert.IsType(PixelOperations.Instance); + } + + [Fact] + public void ToVector4SimdAligned() + { + if (!Vector.IsHardwareAccelerated) + { + return; + } + + ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(64); + Vector4[] expected = CreateExpectedVector4Data(source); + + TestOperation( + source, + expected, + (s, d) => ImageSharp.PixelFormats.Rgba32.PixelOperations.ToVector4SimdAligned(s, d.GetSpan(), 64) + ); + } + + [Fact] + public void BulkConvertByteToNormalizedFloat() + { + if (!Vector.IsHardwareAccelerated) + { + return; + } + + ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(64); + Vector4[] expected = CreateExpectedVector4Data(source); + + TestOperation( + source, + expected, + (s, d) => + { + ReadOnlySpan sBytes = MemoryMarshal.Cast(s); + Span dFloats = MemoryMarshal.Cast(d.Memory.Span); + + SimdUtils.BulkConvertByteToNormalizedFloat(sBytes, dFloats); + } + ); + } + + + // [Fact] // Profiling benchmark - enable manually! +#pragma warning disable xUnit1013 // Public method should be marked as test + public void Benchmark_ToVector4() +#pragma warning restore xUnit1013 // Public method should be marked as test + { + int times = 200000; + int count = 1024; + + using (IMemoryOwner source = Configuration.Default.MemoryAllocator.Allocate(count)) + using (IMemoryOwner dest = Configuration.Default.MemoryAllocator.Allocate(count)) + { + this.Measure( + times, + () => + { + PixelOperations.Instance.ToVector4(source.GetSpan(), dest.GetSpan(), count); + }); + } + } + } + + public class Argb32 : PixelOperationsTests + { + // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class: + public Argb32(ITestOutputHelper output) + : base(output) + { + } + + public static new TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; + } + + [Theory] + [WithBlankImages(1, 1, PixelTypes.All)] + public void GetGlobalInstance(TestImageProvider dummy) + where TPixel : struct, IPixel + { + Assert.NotNull(PixelOperations.Instance); } [Fact] @@ -99,594 +123,594 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats Assert.False(new GraphicsOptions(true).IsOpaqueColorWithoutBlending(ImageSharp.PixelFormats.Rgba32.Transparent)); Assert.False(new GraphicsOptions(true, PixelColorBlendingMode.Lighten, 1).IsOpaqueColorWithoutBlending(ImageSharp.PixelFormats.Rgba32.Red)); Assert.False(new GraphicsOptions(true, PixelColorBlendingMode.Normal,PixelAlphaCompositionMode.DestOver, 1).IsOpaqueColorWithoutBlending(ImageSharp.PixelFormats.Rgba32.Red)); - } - } - - public abstract class PixelOperationsTests : MeasureFixture - where TPixel : struct, IPixel - { - protected PixelOperationsTests(ITestOutputHelper output) - : base(output) - { - } - - public static TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; - - private static PixelOperations Operations => PixelOperations.Instance; - - internal static TPixel[] CreateExpectedPixelData(Vector4[] source) - { - var expected = new TPixel[source.Length]; - - for (int i = 0; i < expected.Length; i++) - { - expected[i].PackFromVector4(source[i]); - } - return expected; - } - - internal static TPixel[] CreateScaledExpectedPixelData(Vector4[] source) - { - var expected = new TPixel[source.Length]; - - for (int i = 0; i < expected.Length; i++) - { - expected[i].PackFromScaledVector4(source[i]); - } - return expected; - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void PackFromVector4(int count) - { - Vector4[] source = CreateVector4TestData(count); - TPixel[] expected = CreateExpectedPixelData(source); - - TestOperation( - source, - expected, - (s, d) => Operations.PackFromVector4(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void PackFromScaledVector4(int count) - { - Vector4[] source = CreateVector4TestData(count); - TPixel[] expected = CreateScaledExpectedPixelData(source); - - TestOperation( - source, - expected, - (s, d) => Operations.PackFromScaledVector4(s, d.GetSpan(), count) - ); - } - - internal static Vector4[] CreateExpectedVector4Data(TPixel[] source) - { - var expected = new Vector4[source.Length]; - - for (int i = 0; i < expected.Length; i++) - { - expected[i] = source[i].ToVector4(); - } - return expected; - } - - internal static Vector4[] CreateExpectedScaledVector4Data(TPixel[] source) - { - var expected = new Vector4[source.Length]; - - for (int i = 0; i < expected.Length; i++) - { - expected[i] = source[i].ToScaledVector4(); - } - return expected; - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void ToVector4(int count) - { - TPixel[] source = CreatePixelTestData(count); - Vector4[] expected = CreateExpectedVector4Data(source); - - TestOperation( - source, - expected, - (s, d) => Operations.ToVector4(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void ToScaledVector4(int count) - { - TPixel[] source = CreateScaledPixelTestData(count); - Vector4[] expected = CreateExpectedScaledVector4Data(source); - - TestOperation( - source, - expected, - (s, d) => Operations.ToScaledVector4(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void PackFromRgb24Bytes(int count) - { - byte[] source = CreateByteTestData(count * 3); - var expected = new TPixel[count]; - - for (int i = 0; i < count; i++) - { - int i3 = i * 3; - - expected[i].PackFromRgba32(new Rgba32(source[i3 + 0], source[i3 + 1], source[i3 + 2], 255)); - } - - TestOperation( - source, - expected, - (s, d) => Operations.PackFromRgb24Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void ToRgb24Bytes(int count) - { - TPixel[] source = CreatePixelTestData(count); - byte[] expected = new byte[count * 3]; - var rgb = default(Rgb24); - - for (int i = 0; i < count; i++) - { - int i3 = i * 3; - source[i].ToRgb24(ref rgb); - expected[i3] = rgb.R; - expected[i3 + 1] = rgb.G; - expected[i3 + 2] = rgb.B; - } - - TestOperation( - source, - expected, - (s, d) => Operations.ToRgb24Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void PackFromRgba32Bytes(int count) - { - byte[] source = CreateByteTestData(count * 4); - var expected = new TPixel[count]; - - for (int i = 0; i < count; i++) - { - int i4 = i * 4; - - expected[i].PackFromRgba32(new Rgba32(source[i4 + 0], source[i4 + 1], source[i4 + 2], source[i4 + 3])); - } - - TestOperation( - source, - expected, - (s, d) => Operations.PackFromRgba32Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void ToRgba32Bytes(int count) - { - TPixel[] source = CreatePixelTestData(count); - byte[] expected = new byte[count * 4]; - var rgba = default(Rgba32); - - for (int i = 0; i < count; i++) - { - int i4 = i * 4; - source[i].ToRgba32(ref rgba); - expected[i4] = rgba.R; - expected[i4 + 1] = rgba.G; - expected[i4 + 2] = rgba.B; - expected[i4 + 3] = rgba.A; - } - - TestOperation( - source, - expected, - (s, d) => Operations.ToRgba32Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void PackFromRgb48Bytes(int count) - { - byte[] source = CreateByteTestData(count * 6); - Span sourceSpan = source.AsSpan(); - var expected = new TPixel[count]; - - var rgba64 = new Rgba64(0, 0, 0, 65535); - for (int i = 0; i < count; i++) - { - int i6 = i * 6; - rgba64.Rgb = MemoryMarshal.Cast(sourceSpan.Slice(i6, 6))[0]; - expected[i].PackFromRgba64(rgba64); - } - - TestOperation( - source, - expected, - (s, d) => Operations.PackFromRgb48Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void ToRgb48Bytes(int count) - { - TPixel[] source = CreatePixelTestData(count); - byte[] expected = new byte[count * 6]; - Rgb48 rgb = default; - - for (int i = 0; i < count; i++) - { - int i6 = i * 6; - source[i].ToRgb48(ref rgb); - Rgba64Bytes rgb48Bytes = Unsafe.As(ref rgb); - expected[i6] = rgb48Bytes[0]; - expected[i6 + 1] = rgb48Bytes[1]; - expected[i6 + 2] = rgb48Bytes[2]; - expected[i6 + 3] = rgb48Bytes[3]; - expected[i6 + 4] = rgb48Bytes[4]; - expected[i6 + 5] = rgb48Bytes[5]; - } - - TestOperation( - source, - expected, - (s, d) => Operations.ToRgb48Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void PackFromRgba64Bytes(int count) - { - byte[] source = CreateByteTestData(count * 8); - Span sourceSpan = source.AsSpan(); - var expected = new TPixel[count]; - - for (int i = 0; i < count; i++) - { - int i8 = i * 8; - expected[i].PackFromRgba64(MemoryMarshal.Cast(sourceSpan.Slice(i8, 8))[0]); - } - - TestOperation( - source, - expected, - (s, d) => Operations.PackFromRgba64Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void ToRgba64Bytes(int count) - { - TPixel[] source = CreatePixelTestData(count); - byte[] expected = new byte[count * 8]; - Rgba64 rgba = default; - - for (int i = 0; i < count; i++) - { - int i8 = i * 8; - source[i].ToRgba64(ref rgba); - Rgba64Bytes rgba64Bytes = Unsafe.As(ref rgba); - expected[i8] = rgba64Bytes[0]; - expected[i8 + 1] = rgba64Bytes[1]; - expected[i8 + 2] = rgba64Bytes[2]; - expected[i8 + 3] = rgba64Bytes[3]; - expected[i8 + 4] = rgba64Bytes[4]; - expected[i8 + 5] = rgba64Bytes[5]; - expected[i8 + 6] = rgba64Bytes[6]; - expected[i8 + 7] = rgba64Bytes[7]; - } - - TestOperation( - source, - expected, - (s, d) => Operations.ToRgba64Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void PackFromBgr24Bytes(int count) - { - byte[] source = CreateByteTestData(count * 3); - var expected = new TPixel[count]; - - for (int i = 0; i < count; i++) - { - int i3 = i * 3; - - expected[i].PackFromRgba32(new Rgba32(source[i3 + 2], source[i3 + 1], source[i3 + 0], 255)); - } - - TestOperation( - source, - expected, - (s, d) => Operations.PackFromBgr24Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void ToBgr24Bytes(int count) - { - TPixel[] source = CreatePixelTestData(count); - byte[] expected = new byte[count * 3]; - var bgr = default(Bgr24); - - for (int i = 0; i < count; i++) - { - int i3 = i * 3; - source[i].ToBgr24(ref bgr); - expected[i3] = bgr.B; - expected[i3 + 1] = bgr.G; - expected[i3 + 2] = bgr.R; - } - - TestOperation( - source, - expected, - (s, d) => Operations.ToBgr24Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void PackFromBgra32Bytes(int count) - { - byte[] source = CreateByteTestData(count * 4); - var expected = new TPixel[count]; - - for (int i = 0; i < count; i++) - { - int i4 = i * 4; - - expected[i].PackFromRgba32(new Rgba32(source[i4 + 2], source[i4 + 1], source[i4 + 0], source[i4 + 3])); - } - - TestOperation( - source, - expected, - (s, d) => Operations.PackFromBgra32Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void ToZyxwBytes(int count) - { - TPixel[] source = CreatePixelTestData(count); - byte[] expected = new byte[count * 4]; - var bgra = default(Bgra32); - - for (int i = 0; i < count; i++) - { - int i4 = i * 4; - source[i].ToBgra32(ref bgra); - expected[i4] = bgra.B; - expected[i4 + 1] = bgra.G; - expected[i4 + 2] = bgra.R; - expected[i4 + 3] = bgra.A; - } - - TestOperation( - source, - expected, - (s, d) => Operations.ToBgra32Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void PackFromArgb32Bytes(int count) - { - byte[] source = CreateByteTestData(count * 4); - var expected = new TPixel[count]; - - for (int i = 0; i < count; i++) - { - int i4 = i * 4; - - expected[i].PackFromRgba32(new Rgba32(source[i4 + 1], source[i4 + 2], source[i4 + 3], source[i4 + 0])); - } - - TestOperation( - source, - expected, - (s, d) => Operations.PackFromArgb32Bytes(s, d.GetSpan(), count) - ); - } - - [Theory] - [MemberData(nameof(ArraySizesData))] - public void ToArgb32Bytes(int count) - { - TPixel[] source = CreatePixelTestData(count); - byte[] expected = new byte[count * 4]; - var argb = default(Argb32); - - for (int i = 0; i < count; i++) - { - int i4 = i * 4; - source[i].ToArgb32(ref argb); - expected[i4] = argb.A; - expected[i4 + 1] = argb.R; - expected[i4 + 2] = argb.G; - expected[i4 + 3] = argb.B; - } - - TestOperation( - source, - expected, - (s, d) => Operations.ToArgb32Bytes(s, d.GetSpan(), count) - ); - } - - private class TestBuffers : IDisposable - where TSource : struct - where TDest : struct - { - public TSource[] SourceBuffer { get; } - public IMemoryOwner ActualDestBuffer { get; } - public TDest[] ExpectedDestBuffer { get; } - - public TestBuffers(TSource[] source, TDest[] expectedDest) - { - this.SourceBuffer = source; - this.ExpectedDestBuffer = expectedDest; - this.ActualDestBuffer = Configuration.Default.MemoryAllocator.Allocate(expectedDest.Length); - } - - public void Dispose() - { - this.ActualDestBuffer.Dispose(); - } - - private const float Tolerance = 0.0001f; - - public void Verify() - { - int count = this.ExpectedDestBuffer.Length; - - if (typeof(TDest) == typeof(Vector4)) - { - - Span expected = MemoryMarshal.Cast(this.ExpectedDestBuffer.AsSpan()); - Span actual = MemoryMarshal.Cast(this.ActualDestBuffer.GetSpan()); - - for (int i = 0; i < count; i++) - { - // ReSharper disable PossibleNullReferenceException - Assert.Equal(expected[i], actual[i], new ApproximateFloatComparer(0.001f)); - // ReSharper restore PossibleNullReferenceException - } - } - else - { - Span expected = this.ExpectedDestBuffer.AsSpan(); - Span actual = this.ActualDestBuffer.GetSpan(); - for (int i = 0; i < count; i++) - { - Assert.Equal(expected[i], actual[i]); - } - } - } - } - - internal static void TestOperation( - TSource[] source, - TDest[] expected, - Action> action) - where TSource : struct - where TDest : struct - { - using (var buffers = new TestBuffers(source, expected)) - { - action(buffers.SourceBuffer, buffers.ActualDestBuffer); - buffers.Verify(); - } - } - - internal static Vector4[] CreateVector4TestData(int length) - { - var result = new Vector4[length]; - var rnd = new Random(42); // Deterministic random values - - for (int i = 0; i < result.Length; i++) - { - result[i] = GetVector(rnd); - } - return result; - } - - internal static TPixel[] CreatePixelTestData(int length) - { - var result = new TPixel[length]; - - var rnd = new Random(42); // Deterministic random values - - for (int i = 0; i < result.Length; i++) - { - Vector4 v = GetVector(rnd); - result[i].PackFromVector4(v); - } - - return result; - } - - internal static TPixel[] CreateScaledPixelTestData(int length) - { - var result = new TPixel[length]; - - var rnd = new Random(42); // Deterministic random values - - for (int i = 0; i < result.Length; i++) - { - Vector4 v = GetVector(rnd); - result[i].PackFromScaledVector4(v); - } - - return result; - } - - internal static byte[] CreateByteTestData(int length) - { - byte[] result = new byte[length]; - var rnd = new Random(42); // Deterministic random values - - for (int i = 0; i < result.Length; i++) - { - result[i] = (byte)rnd.Next(255); - } - return result; - } - - internal static Vector4 GetVector(Random rnd) - { - return new Vector4( - (float)rnd.NextDouble(), - (float)rnd.NextDouble(), - (float)rnd.NextDouble(), - (float)rnd.NextDouble() - ); - } - - [StructLayout(LayoutKind.Sequential)] - private unsafe struct Rgba64Bytes - { - public fixed byte Data[8]; - - public byte this[int idx] - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get - { - ref byte self = ref Unsafe.As(ref this); - return Unsafe.Add(ref self, idx); - } - } - } - } + } + } + + public abstract class PixelOperationsTests : MeasureFixture + where TPixel : struct, IPixel + { + protected PixelOperationsTests(ITestOutputHelper output) + : base(output) + { + } + + public static TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; + + private static PixelOperations Operations => PixelOperations.Instance; + + internal static TPixel[] CreateExpectedPixelData(Vector4[] source) + { + var expected = new TPixel[source.Length]; + + for (int i = 0; i < expected.Length; i++) + { + expected[i].PackFromVector4(source[i]); + } + return expected; + } + + internal static TPixel[] CreateScaledExpectedPixelData(Vector4[] source) + { + var expected = new TPixel[source.Length]; + + for (int i = 0; i < expected.Length; i++) + { + expected[i].PackFromScaledVector4(source[i]); + } + return expected; + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void PackFromVector4(int count) + { + Vector4[] source = CreateVector4TestData(count); + TPixel[] expected = CreateExpectedPixelData(source); + + TestOperation( + source, + expected, + (s, d) => Operations.PackFromVector4(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void PackFromScaledVector4(int count) + { + Vector4[] source = CreateVector4TestData(count); + TPixel[] expected = CreateScaledExpectedPixelData(source); + + TestOperation( + source, + expected, + (s, d) => Operations.PackFromScaledVector4(s, d.GetSpan(), count) + ); + } + + internal static Vector4[] CreateExpectedVector4Data(TPixel[] source) + { + var expected = new Vector4[source.Length]; + + for (int i = 0; i < expected.Length; i++) + { + expected[i] = source[i].ToVector4(); + } + return expected; + } + + internal static Vector4[] CreateExpectedScaledVector4Data(TPixel[] source) + { + var expected = new Vector4[source.Length]; + + for (int i = 0; i < expected.Length; i++) + { + expected[i] = source[i].ToScaledVector4(); + } + return expected; + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void ToVector4(int count) + { + TPixel[] source = CreatePixelTestData(count); + Vector4[] expected = CreateExpectedVector4Data(source); + + TestOperation( + source, + expected, + (s, d) => Operations.ToVector4(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void ToScaledVector4(int count) + { + TPixel[] source = CreateScaledPixelTestData(count); + Vector4[] expected = CreateExpectedScaledVector4Data(source); + + TestOperation( + source, + expected, + (s, d) => Operations.ToScaledVector4(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void PackFromRgb24Bytes(int count) + { + byte[] source = CreateByteTestData(count * 3); + var expected = new TPixel[count]; + + for (int i = 0; i < count; i++) + { + int i3 = i * 3; + + expected[i].PackFromRgba32(new Rgba32(source[i3 + 0], source[i3 + 1], source[i3 + 2], 255)); + } + + TestOperation( + source, + expected, + (s, d) => Operations.PackFromRgb24Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void ToRgb24Bytes(int count) + { + TPixel[] source = CreatePixelTestData(count); + byte[] expected = new byte[count * 3]; + var rgb = default(Rgb24); + + for (int i = 0; i < count; i++) + { + int i3 = i * 3; + source[i].ToRgb24(ref rgb); + expected[i3] = rgb.R; + expected[i3 + 1] = rgb.G; + expected[i3 + 2] = rgb.B; + } + + TestOperation( + source, + expected, + (s, d) => Operations.ToRgb24Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void PackFromRgba32Bytes(int count) + { + byte[] source = CreateByteTestData(count * 4); + var expected = new TPixel[count]; + + for (int i = 0; i < count; i++) + { + int i4 = i * 4; + + expected[i].PackFromRgba32(new Rgba32(source[i4 + 0], source[i4 + 1], source[i4 + 2], source[i4 + 3])); + } + + TestOperation( + source, + expected, + (s, d) => Operations.PackFromRgba32Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void ToRgba32Bytes(int count) + { + TPixel[] source = CreatePixelTestData(count); + byte[] expected = new byte[count * 4]; + var rgba = default(Rgba32); + + for (int i = 0; i < count; i++) + { + int i4 = i * 4; + source[i].ToRgba32(ref rgba); + expected[i4] = rgba.R; + expected[i4 + 1] = rgba.G; + expected[i4 + 2] = rgba.B; + expected[i4 + 3] = rgba.A; + } + + TestOperation( + source, + expected, + (s, d) => Operations.ToRgba32Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void PackFromRgb48Bytes(int count) + { + byte[] source = CreateByteTestData(count * 6); + Span sourceSpan = source.AsSpan(); + var expected = new TPixel[count]; + + var rgba64 = new Rgba64(0, 0, 0, 65535); + for (int i = 0; i < count; i++) + { + int i6 = i * 6; + rgba64.Rgb = MemoryMarshal.Cast(sourceSpan.Slice(i6, 6))[0]; + expected[i].PackFromRgba64(rgba64); + } + + TestOperation( + source, + expected, + (s, d) => Operations.PackFromRgb48Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void ToRgb48Bytes(int count) + { + TPixel[] source = CreatePixelTestData(count); + byte[] expected = new byte[count * 6]; + Rgb48 rgb = default; + + for (int i = 0; i < count; i++) + { + int i6 = i * 6; + source[i].ToRgb48(ref rgb); + Rgba64Bytes rgb48Bytes = Unsafe.As(ref rgb); + expected[i6] = rgb48Bytes[0]; + expected[i6 + 1] = rgb48Bytes[1]; + expected[i6 + 2] = rgb48Bytes[2]; + expected[i6 + 3] = rgb48Bytes[3]; + expected[i6 + 4] = rgb48Bytes[4]; + expected[i6 + 5] = rgb48Bytes[5]; + } + + TestOperation( + source, + expected, + (s, d) => Operations.ToRgb48Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void PackFromRgba64Bytes(int count) + { + byte[] source = CreateByteTestData(count * 8); + Span sourceSpan = source.AsSpan(); + var expected = new TPixel[count]; + + for (int i = 0; i < count; i++) + { + int i8 = i * 8; + expected[i].PackFromRgba64(MemoryMarshal.Cast(sourceSpan.Slice(i8, 8))[0]); + } + + TestOperation( + source, + expected, + (s, d) => Operations.PackFromRgba64Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void ToRgba64Bytes(int count) + { + TPixel[] source = CreatePixelTestData(count); + byte[] expected = new byte[count * 8]; + Rgba64 rgba = default; + + for (int i = 0; i < count; i++) + { + int i8 = i * 8; + source[i].ToRgba64(ref rgba); + Rgba64Bytes rgba64Bytes = Unsafe.As(ref rgba); + expected[i8] = rgba64Bytes[0]; + expected[i8 + 1] = rgba64Bytes[1]; + expected[i8 + 2] = rgba64Bytes[2]; + expected[i8 + 3] = rgba64Bytes[3]; + expected[i8 + 4] = rgba64Bytes[4]; + expected[i8 + 5] = rgba64Bytes[5]; + expected[i8 + 6] = rgba64Bytes[6]; + expected[i8 + 7] = rgba64Bytes[7]; + } + + TestOperation( + source, + expected, + (s, d) => Operations.ToRgba64Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void PackFromBgr24Bytes(int count) + { + byte[] source = CreateByteTestData(count * 3); + var expected = new TPixel[count]; + + for (int i = 0; i < count; i++) + { + int i3 = i * 3; + + expected[i].PackFromRgba32(new Rgba32(source[i3 + 2], source[i3 + 1], source[i3 + 0], 255)); + } + + TestOperation( + source, + expected, + (s, d) => Operations.PackFromBgr24Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void ToBgr24Bytes(int count) + { + TPixel[] source = CreatePixelTestData(count); + byte[] expected = new byte[count * 3]; + var bgr = default(Bgr24); + + for (int i = 0; i < count; i++) + { + int i3 = i * 3; + source[i].ToBgr24(ref bgr); + expected[i3] = bgr.B; + expected[i3 + 1] = bgr.G; + expected[i3 + 2] = bgr.R; + } + + TestOperation( + source, + expected, + (s, d) => Operations.ToBgr24Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void PackFromBgra32Bytes(int count) + { + byte[] source = CreateByteTestData(count * 4); + var expected = new TPixel[count]; + + for (int i = 0; i < count; i++) + { + int i4 = i * 4; + + expected[i].PackFromRgba32(new Rgba32(source[i4 + 2], source[i4 + 1], source[i4 + 0], source[i4 + 3])); + } + + TestOperation( + source, + expected, + (s, d) => Operations.PackFromBgra32Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void ToZyxwBytes(int count) + { + TPixel[] source = CreatePixelTestData(count); + byte[] expected = new byte[count * 4]; + var bgra = default(Bgra32); + + for (int i = 0; i < count; i++) + { + int i4 = i * 4; + source[i].ToBgra32(ref bgra); + expected[i4] = bgra.B; + expected[i4 + 1] = bgra.G; + expected[i4 + 2] = bgra.R; + expected[i4 + 3] = bgra.A; + } + + TestOperation( + source, + expected, + (s, d) => Operations.ToBgra32Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void PackFromArgb32Bytes(int count) + { + byte[] source = CreateByteTestData(count * 4); + var expected = new TPixel[count]; + + for (int i = 0; i < count; i++) + { + int i4 = i * 4; + + expected[i].PackFromRgba32(new Rgba32(source[i4 + 1], source[i4 + 2], source[i4 + 3], source[i4 + 0])); + } + + TestOperation( + source, + expected, + (s, d) => Operations.PackFromArgb32Bytes(s, d.GetSpan(), count) + ); + } + + [Theory] + [MemberData(nameof(ArraySizesData))] + public void ToArgb32Bytes(int count) + { + TPixel[] source = CreatePixelTestData(count); + byte[] expected = new byte[count * 4]; + var argb = default(Argb32); + + for (int i = 0; i < count; i++) + { + int i4 = i * 4; + source[i].ToArgb32(ref argb); + expected[i4] = argb.A; + expected[i4 + 1] = argb.R; + expected[i4 + 2] = argb.G; + expected[i4 + 3] = argb.B; + } + + TestOperation( + source, + expected, + (s, d) => Operations.ToArgb32Bytes(s, d.GetSpan(), count) + ); + } + + private class TestBuffers : IDisposable + where TSource : struct + where TDest : struct + { + public TSource[] SourceBuffer { get; } + public IMemoryOwner ActualDestBuffer { get; } + public TDest[] ExpectedDestBuffer { get; } + + public TestBuffers(TSource[] source, TDest[] expectedDest) + { + this.SourceBuffer = source; + this.ExpectedDestBuffer = expectedDest; + this.ActualDestBuffer = Configuration.Default.MemoryAllocator.Allocate(expectedDest.Length); + } + + public void Dispose() + { + this.ActualDestBuffer.Dispose(); + } + + private const float Tolerance = 0.0001f; + + public void Verify() + { + int count = this.ExpectedDestBuffer.Length; + + if (typeof(TDest) == typeof(Vector4)) + { + + Span expected = MemoryMarshal.Cast(this.ExpectedDestBuffer.AsSpan()); + Span actual = MemoryMarshal.Cast(this.ActualDestBuffer.GetSpan()); + + for (int i = 0; i < count; i++) + { + // ReSharper disable PossibleNullReferenceException + Assert.Equal(expected[i], actual[i], new ApproximateFloatComparer(0.001f)); + // ReSharper restore PossibleNullReferenceException + } + } + else + { + Span expected = this.ExpectedDestBuffer.AsSpan(); + Span actual = this.ActualDestBuffer.GetSpan(); + for (int i = 0; i < count; i++) + { + Assert.Equal(expected[i], actual[i]); + } + } + } + } + + internal static void TestOperation( + TSource[] source, + TDest[] expected, + Action> action) + where TSource : struct + where TDest : struct + { + using (var buffers = new TestBuffers(source, expected)) + { + action(buffers.SourceBuffer, buffers.ActualDestBuffer); + buffers.Verify(); + } + } + + internal static Vector4[] CreateVector4TestData(int length) + { + var result = new Vector4[length]; + var rnd = new Random(42); // Deterministic random values + + for (int i = 0; i < result.Length; i++) + { + result[i] = GetVector(rnd); + } + return result; + } + + internal static TPixel[] CreatePixelTestData(int length) + { + var result = new TPixel[length]; + + var rnd = new Random(42); // Deterministic random values + + for (int i = 0; i < result.Length; i++) + { + Vector4 v = GetVector(rnd); + result[i].PackFromVector4(v); + } + + return result; + } + + internal static TPixel[] CreateScaledPixelTestData(int length) + { + var result = new TPixel[length]; + + var rnd = new Random(42); // Deterministic random values + + for (int i = 0; i < result.Length; i++) + { + Vector4 v = GetVector(rnd); + result[i].PackFromScaledVector4(v); + } + + return result; + } + + internal static byte[] CreateByteTestData(int length) + { + byte[] result = new byte[length]; + var rnd = new Random(42); // Deterministic random values + + for (int i = 0; i < result.Length; i++) + { + result[i] = (byte)rnd.Next(255); + } + return result; + } + + internal static Vector4 GetVector(Random rnd) + { + return new Vector4( + (float)rnd.NextDouble(), + (float)rnd.NextDouble(), + (float)rnd.NextDouble(), + (float)rnd.NextDouble() + ); + } + + [StructLayout(LayoutKind.Sequential)] + private unsafe struct Rgba64Bytes + { + public fixed byte Data[8]; + + public byte this[int idx] + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get + { + ref byte self = ref Unsafe.As(ref this); + return Unsafe.Add(ref self, idx); + } + } + } + } } \ No newline at end of file From af7d96d21462e6e080488f7d1933d1430834fadd Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Mon, 15 Oct 2018 01:11:59 +0200 Subject: [PATCH 02/22] SIMD byte -> float conversion: BulkConvertByteToNormalizedFloatFast --- src/ImageSharp/Common/Extensions/SimdUtils.cs | 45 +++++++++++++++++-- .../Color/Bulk/ToVector4.cs | 16 +++++-- .../ImageSharp.Benchmarks.csproj | 2 +- .../PixelFormats/PixelOperationsTests.cs | 24 ++++++++++ .../Tests/TestEnvironmentTests.cs | 2 + 5 files changed, 81 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.cs b/src/ImageSharp/Common/Extensions/SimdUtils.cs index db1e80dda..56118a764 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.cs +++ b/src/ImageSharp/Common/Extensions/SimdUtils.cs @@ -105,13 +105,50 @@ namespace SixLabors.ImageSharp } } - internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) + /// + /// Fast -> conversion for RyuJIT runtimes having dotnet/coreclr#10662 merged. + /// + /// https://github.com/dotnet/coreclr/pull/10662 + /// + /// + internal static void BulkConvertByteToNormalizedFloatFast(ReadOnlySpan source, Span dest) { - if (!Vector.IsHardwareAccelerated) + Guard.IsTrue( + source.Length % Vector.Count == 0, + nameof(source), + "dest.Length should be divisable by Vector.Count!"); + + int n = source.Length / Vector.Count; + + ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) { - throw new InvalidOperationException( - "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!"); + Vector b = Unsafe.Add(ref sourceBase, i); + + Vector.Widen(b, out Vector s0, out Vector s1); + Vector.Widen(s0, out Vector w0, out Vector w1); + Vector.Widen(s1, out Vector w2, out Vector w3); + + Vector f0 = Vector.ConvertToSingle(w0) * scale; + Vector f1 = Vector.ConvertToSingle(w1) * scale; + Vector f2 = Vector.ConvertToSingle(w2) * scale; + Vector f3 = Vector.ConvertToSingle(w3) * scale; + + ref Vector d = ref Unsafe.Add(ref destBase, i * 4); + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; } + } + + internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) + { + GuardAvx2(nameof(BulkConvertByteToNormalizedFloat)); DebugGuard.IsTrue((dest.Length % Vector.Count) == 0, nameof(source), "dest.Length should be divisable by Vector.Count!"); diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 7b6f902d8..0e5e9d94f 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -28,7 +28,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Params( //64, - 1024)] + 2048)] public int Count { get; set; } [GlobalSetup] @@ -72,14 +72,14 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } [CoreJob] - //[ClrJob] + [ClrJob] public class ToVector4_Rgba32 : ToVector4 { class Config : ManualConfig { } - [Benchmark] + //[Benchmark] public void BulkConvertByteToNormalizedFloat() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -87,5 +87,15 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } + + [Benchmark] + public void BulkConvertByteToNormalizedFloatFast() + { + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + + SimdUtils.BulkConvertByteToNormalizedFloatFast(sBytes, dFloats); + } + } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj b/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj index 36b7d4db4..e470e7821 100644 --- a/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj +++ b/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj @@ -1,6 +1,6 @@  - netcoreapp2.0;net461 + netcoreapp2.1;net461 Exe True SixLabors.ImageSharp.Benchmarks diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs index a96da03e7..2e84886c0 100644 --- a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs +++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs @@ -73,6 +73,30 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats ); } + [Fact] + public void BulkConvertByteToNormalizedFloatFast() + { + if (!Vector.IsHardwareAccelerated) + { + return; + } + + ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(128); + Vector4[] expected = CreateExpectedVector4Data(source); + + TestOperation( + source, + expected, + (s, d) => + { + ReadOnlySpan sBytes = MemoryMarshal.Cast(s); + Span dFloats = MemoryMarshal.Cast(d.Memory.Span); + + SimdUtils.BulkConvertByteToNormalizedFloatFast(sBytes, dFloats); + } + ); + } + // [Fact] // Profiling benchmark - enable manually! #pragma warning disable xUnit1013 // Public method should be marked as test diff --git a/tests/ImageSharp.Tests/TestUtilities/Tests/TestEnvironmentTests.cs b/tests/ImageSharp.Tests/TestUtilities/Tests/TestEnvironmentTests.cs index 8a3e69059..30bb16c2a 100644 --- a/tests/ImageSharp.Tests/TestUtilities/Tests/TestEnvironmentTests.cs +++ b/tests/ImageSharp.Tests/TestUtilities/Tests/TestEnvironmentTests.cs @@ -3,6 +3,8 @@ using System; using System.IO; +using System.Reflection; +using System.Runtime.InteropServices; using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Formats; From 281c52786aff178f33ae5a928d4caff7b614a9ca Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Mon, 15 Oct 2018 01:30:35 +0200 Subject: [PATCH 03/22] move tests --- src/ImageSharp/Common/Extensions/SimdUtils.cs | 2 +- .../Color/Bulk/ToVector4.cs | 2 +- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 40 ++++++++++++++++ .../PixelFormats/PixelOperationsTests.cs | 48 ------------------- .../TestUtilities/TestDataGenerator.cs | 7 +++ 5 files changed, 49 insertions(+), 50 deletions(-) diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.cs b/src/ImageSharp/Common/Extensions/SimdUtils.cs index 56118a764..481e0726d 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.cs +++ b/src/ImageSharp/Common/Extensions/SimdUtils.cs @@ -111,7 +111,7 @@ namespace SixLabors.ImageSharp /// https://github.com/dotnet/coreclr/pull/10662 /// /// - internal static void BulkConvertByteToNormalizedFloatFast(ReadOnlySpan source, Span dest) + internal static void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(ReadOnlySpan source, Span dest) { Guard.IsTrue( source.Length % Vector.Count == 0, diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 0e5e9d94f..3ea256e85 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -94,7 +94,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - SimdUtils.BulkConvertByteToNormalizedFloatFast(sBytes, dFloats); + SimdUtils.BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(sBytes, dFloats); } } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index c6c3b68f3..4e39af70f 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -186,6 +186,46 @@ namespace SixLabors.ImageSharp.Tests.Common Assert.Equal(expected, dest); } + [Theory] + [InlineData(1, 0)] + [InlineData(2, 32)] + [InlineData(3, 128)] + public void BulkConvertByteToNormalizedFloat(int seed, int count) + { + if (this.SkipOnNonAvx2()) + { + return; + } + + byte[] source = new Random(seed).GenerateRandomByteArray(count); + float[] result = new float[count]; + float[] expected = source.Select(b => (float)b / 255f).ToArray(); + + SimdUtils.BulkConvertByteToNormalizedFloat(source, result); + + Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); + } + + [Theory] + [InlineData(1, 0)] + [InlineData(2, 32)] + [InlineData(3, 128)] + public void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(int seed, int count) + { + if (!Vector.IsHardwareAccelerated) + { + return; + } + + byte[] source = new Random(seed).GenerateRandomByteArray(count); + float[] result = new float[count]; + float[] expected = source.Select(b => (float)b / 255f).ToArray(); + + SimdUtils.BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(source, result); + + Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); + } + [Theory] [InlineData(0)] [InlineData(7)] diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs index 2e84886c0..4d7ec71e7 100644 --- a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs +++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs @@ -49,54 +49,6 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats ); } - [Fact] - public void BulkConvertByteToNormalizedFloat() - { - if (!Vector.IsHardwareAccelerated) - { - return; - } - - ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(64); - Vector4[] expected = CreateExpectedVector4Data(source); - - TestOperation( - source, - expected, - (s, d) => - { - ReadOnlySpan sBytes = MemoryMarshal.Cast(s); - Span dFloats = MemoryMarshal.Cast(d.Memory.Span); - - SimdUtils.BulkConvertByteToNormalizedFloat(sBytes, dFloats); - } - ); - } - - [Fact] - public void BulkConvertByteToNormalizedFloatFast() - { - if (!Vector.IsHardwareAccelerated) - { - return; - } - - ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(128); - Vector4[] expected = CreateExpectedVector4Data(source); - - TestOperation( - source, - expected, - (s, d) => - { - ReadOnlySpan sBytes = MemoryMarshal.Cast(s); - Span dFloats = MemoryMarshal.Cast(d.Memory.Span); - - SimdUtils.BulkConvertByteToNormalizedFloatFast(sBytes, dFloats); - } - ); - } - // [Fact] // Profiling benchmark - enable manually! #pragma warning disable xUnit1013 // Public method should be marked as test diff --git a/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs b/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs index 0b1b89cc0..6f3b18e1f 100644 --- a/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs +++ b/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs @@ -46,6 +46,13 @@ namespace SixLabors.ImageSharp.Tests return values; } + public static byte[] GenerateRandomByteArray(this Random rnd, int length) + { + byte[] values = new byte[length]; + rnd.NextBytes(values); + return values; + } + private static float GetRandomFloat(Random rnd, float minVal, float maxVal) { return (float)rnd.NextDouble() * (maxVal - minVal) + minVal; From 3e5325e2b9e580a2617a36c8c3bbacef679f6de4 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Tue, 16 Oct 2018 00:35:45 +0200 Subject: [PATCH 04/22] uniformize conversion code --- .../SimdUtils.ExtendedIntrinsics.cs | 64 ++++++++++ src/ImageSharp/Common/Extensions/SimdUtils.cs | 50 ++------ .../PixelFormats/Rgba32.PixelOperations.cs | 117 +++--------------- .../Color/Bulk/PackFromVector4.cs | 4 +- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 6 +- .../PixelFormats/PixelOperationsTests.cs | 38 ++---- 6 files changed, 103 insertions(+), 176 deletions(-) create mode 100644 src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs new file mode 100644 index 000000000..ec52b90ef --- /dev/null +++ b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs @@ -0,0 +1,64 @@ +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + /// + /// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+) + /// PR: + /// https://github.com/dotnet/coreclr/pull/10662 + /// API Proposal: + /// https://github.com/dotnet/corefx/issues/15957 + /// + public static class ExtendedIntrinsics + { + public static bool IsAvailable { get; } = +#if NETCOREAPP2_1 +// TODO: Add a build target for .NET 4.7.2 + true; +#else + false; +#endif + + // ReSharper disable once MemberHidesStaticFromOuterClass + internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) + { + Guard.IsTrue( + source.Length % Vector.Count == 0, + nameof(source), + "dest.Length should be divisable by Vector.Count!"); + + int n = source.Length / Vector.Count; + + ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + Vector b = Unsafe.Add(ref sourceBase, i); + + Vector.Widen(b, out Vector s0, out Vector s1); + Vector.Widen(s0, out Vector w0, out Vector w1); + Vector.Widen(s1, out Vector w2, out Vector w3); + + Vector f0 = Vector.ConvertToSingle(w0) * scale; + Vector f1 = Vector.ConvertToSingle(w1) * scale; + Vector f2 = Vector.ConvertToSingle(w2) * scale; + Vector f3 = Vector.ConvertToSingle(w3) * scale; + + ref Vector d = ref Unsafe.Add(ref destBase, i * 4); + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; + } + } + } + } +} diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.cs b/src/ImageSharp/Common/Extensions/SimdUtils.cs index 481e0726d..3630ede32 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.cs +++ b/src/ImageSharp/Common/Extensions/SimdUtils.cs @@ -14,12 +14,12 @@ namespace SixLabors.ImageSharp /// /// Various extension and utility methods for and utilizing SIMD capabilities /// - internal static class SimdUtils + internal static partial class SimdUtils { /// /// Gets a value indicating whether the code is being executed on AVX2 CPU where both float and integer registers are of size 256 byte. /// - public static bool IsAvx2CompatibleArchitecture => Vector.Count == 8 && Vector.Count == 8; + public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8; internal static void GuardAvx2(string operation) { @@ -61,7 +61,8 @@ namespace SixLabors.ImageSharp /// /// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of values. - /// The values gonna be scaled up into [0-255] and rounded. + /// The values are scaled up into [0-255] and rounded. + /// The implementation is SIMD optimized and works only with `source.Length` divisible by . /// Based on: /// /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions @@ -106,46 +107,13 @@ namespace SixLabors.ImageSharp } /// - /// Fast -> conversion for RyuJIT runtimes having dotnet/coreclr#10662 merged. + /// Converts `dest.Length` bytes to -s to -s normalized into [0..1] + /// The implementation is SIMD optimized and works only with `dest.Length` divisible by . + /// Implementation adapted from: /// - /// https://github.com/dotnet/coreclr/pull/10662 + /// http://stackoverflow.com/a/5362789 /// /// - internal static void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(ReadOnlySpan source, Span dest) - { - Guard.IsTrue( - source.Length % Vector.Count == 0, - nameof(source), - "dest.Length should be divisable by Vector.Count!"); - - int n = source.Length / Vector.Count; - - ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); - ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - - var scale = new Vector(1f / 255f); - - for (int i = 0; i < n; i++) - { - Vector b = Unsafe.Add(ref sourceBase, i); - - Vector.Widen(b, out Vector s0, out Vector s1); - Vector.Widen(s0, out Vector w0, out Vector w1); - Vector.Widen(s1, out Vector w2, out Vector w3); - - Vector f0 = Vector.ConvertToSingle(w0) * scale; - Vector f1 = Vector.ConvertToSingle(w1) * scale; - Vector f2 = Vector.ConvertToSingle(w2) * scale; - Vector f3 = Vector.ConvertToSingle(w3) * scale; - - ref Vector d = ref Unsafe.Add(ref destBase, i * 4); - d = f0; - Unsafe.Add(ref d, 1) = f1; - Unsafe.Add(ref d, 2) = f2; - Unsafe.Add(ref d, 3) = f3; - } - } - internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { GuardAvx2(nameof(BulkConvertByteToNormalizedFloat)); @@ -188,7 +156,7 @@ namespace SixLabors.ImageSharp /// internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) { - GuardAvx2(nameof(BulkConvertNormalizedFloatToByte)); + GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows)); DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!"); diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs index 76e119ba4..6745079da 100644 --- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs @@ -3,7 +3,6 @@ using System; using System.Numerics; -using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using SixLabors.Memory; @@ -19,99 +18,37 @@ namespace SixLabors.ImageSharp.PixelFormats /// internal partial class PixelOperations : PixelOperations { - /// - /// SIMD optimized bulk implementation of - /// that works only with `count` divisible by . - /// - /// The to the source colors. - /// The to the dstination vectors. - /// The number of pixels to convert. - /// - /// Implementation adapted from: - /// - /// http://stackoverflow.com/a/5362789 - /// - /// TODO: We can replace this implementation in the future using new Vector API-s: - /// - /// https://github.com/dotnet/corefx/issues/15957 - /// - /// - internal static void ToVector4SimdAligned(ReadOnlySpan sourceColors, Span destVectors, int count) - { - if (!Vector.IsHardwareAccelerated) - { - throw new InvalidOperationException( - "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!"); - } - - DebugGuard.IsTrue( - count % Vector.Count == 0, - nameof(count), - "Argument 'count' should divisible by Vector.Count!"); - - var bVec = new Vector(256.0f / 255.0f); - var magicFloat = new Vector(32768.0f); - var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f - var mask = new Vector(255); - - int unpackedRawCount = count * 4; - - ref uint sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(sourceColors)); - ref WideRgba destBaseAsWide = ref Unsafe.As(ref MemoryMarshal.GetReference(destVectors)); - ref Vector destBaseAsUInt = ref Unsafe.As>(ref destBaseAsWide); - ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWide); - - for (int i = 0; i < count; i++) - { - uint sVal = Unsafe.Add(ref sourceBase, i); - ref WideRgba dst = ref Unsafe.Add(ref destBaseAsWide, i); - - // This call is the bottleneck now: - dst.Load(sVal); - } - - int numOfVectors = unpackedRawCount / Vector.Count; - - for (int i = 0; i < numOfVectors; i++) - { - Vector vi = Unsafe.Add(ref destBaseAsUInt, i); - - vi &= mask; - vi |= magicInt; - - var vf = Vector.AsVectorSingle(vi); - vf = (vf - magicFloat) * bVec; - - Unsafe.Add(ref destBaseAsFloat, i) = vf; - } - } - /// internal override void ToVector4(ReadOnlySpan sourceColors, Span destinationVectors, int count) { Guard.MustBeSizedAtLeast(sourceColors, count, nameof(sourceColors)); Guard.MustBeSizedAtLeast(destinationVectors, count, nameof(destinationVectors)); - if (count < 256 || !Vector.IsHardwareAccelerated) + if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) { // Doesn't worth to bother with SIMD: base.ToVector4(sourceColors, destinationVectors, count); return; } - int remainder = count % Vector.Count; + int remainder = count % 2; int alignedCount = count - remainder; if (alignedCount > 0) { - ToVector4SimdAligned(sourceColors, destinationVectors, alignedCount); + ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); + Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); + + SimdUtils.BulkConvertByteToNormalizedFloat( + rawSrc, + rawDest); } if (remainder > 0) { - sourceColors = sourceColors.Slice(alignedCount); - destinationVectors = destinationVectors.Slice(alignedCount); - base.ToVector4(sourceColors, destinationVectors, remainder); + // actually: remainder == 1 + int lastIdx = count - 1; + destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4(); } } @@ -120,7 +57,7 @@ namespace SixLabors.ImageSharp.PixelFormats { GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); - if (!SimdUtils.IsAvx2CompatibleArchitecture) + if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) { base.PackFromVector4(sourceVectors, destinationColors, count); return; @@ -131,10 +68,10 @@ namespace SixLabors.ImageSharp.PixelFormats if (alignedCount > 0) { - ReadOnlySpan flatSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount)); - Span flatDest = MemoryMarshal.Cast(destinationColors); + ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount)); + Span rawDest = MemoryMarshal.Cast(destinationColors); - SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(flatSrc, flatDest); + SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); } if (remainder > 0) @@ -172,30 +109,6 @@ namespace SixLabors.ImageSharp.PixelFormats sourcePixels.Slice(0, count).CopyTo(dest); } - - /// - /// Value type to store -s widened into multiple -s. - /// - [StructLayout(LayoutKind.Sequential)] - private struct WideRgba - { - private uint r; - - private uint g; - - private uint b; - - private uint a; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void Load(uint p) - { - this.r = p; - this.g = p >> GreenShift; - this.b = p >> BlueShift; - this.a = p >> AlphaShift; - } - } } } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index a5fa59ba0..bdae7d065 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -23,7 +23,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk private IMemoryOwner destination; - [Params(16, 128, 512)] + [Params( + //64, + 2048)] public int Count { get; set; } [GlobalSetup] diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 4e39af70f..0488dd5e1 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -205,12 +205,12 @@ namespace SixLabors.ImageSharp.Tests.Common Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } - + [Theory] [InlineData(1, 0)] [InlineData(2, 32)] [InlineData(3, 128)] - public void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(int seed, int count) + public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count) { if (!Vector.IsHardwareAccelerated) { @@ -221,7 +221,7 @@ namespace SixLabors.ImageSharp.Tests.Common float[] result = new float[count]; float[] expected = source.Select(b => (float)b / 255f).ToArray(); - SimdUtils.BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(source, result); + SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result); Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs index 4d7ec71e7..535952e05 100644 --- a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs +++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs @@ -17,43 +17,26 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats { public class Rgba32 : PixelOperationsTests { + public const string SkipProfilingBenchmarks = +#if true + "Profiling benchmark - enable manually!"; +#else + null; +#endif + public Rgba32(ITestOutputHelper output) : base(output) { } - // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class: - public static new TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; - [Fact] public void IsSpecialImplementation() { Assert.IsType(PixelOperations.Instance); } - [Fact] - public void ToVector4SimdAligned() - { - if (!Vector.IsHardwareAccelerated) - { - return; - } - - ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(64); - Vector4[] expected = CreateExpectedVector4Data(source); - - TestOperation( - source, - expected, - (s, d) => ImageSharp.PixelFormats.Rgba32.PixelOperations.ToVector4SimdAligned(s, d.GetSpan(), 64) - ); - } - - - // [Fact] // Profiling benchmark - enable manually! -#pragma warning disable xUnit1013 // Public method should be marked as test + [Fact(Skip = SkipProfilingBenchmarks)] public void Benchmark_ToVector4() -#pragma warning restore xUnit1013 // Public method should be marked as test { int times = 200000; int count = 1024; @@ -73,13 +56,10 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats public class Argb32 : PixelOperationsTests { - // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class: public Argb32(ITestOutputHelper output) : base(output) { } - - public static new TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; } [Theory] @@ -110,7 +90,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats { } - public static TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; + public static TheoryData ArraySizesData => new TheoryData { 0, 1, 2, 7, 16, 1111 }; private static PixelOperations Operations => PixelOperations.Instance; From df87a68555b480e58e886e6f9d2513db29c8d5fd Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Tue, 16 Oct 2018 01:02:39 +0200 Subject: [PATCH 05/22] BulkConvertNormalizedFloatToByteClampOverflows --- .../SimdUtils.ExtendedIntrinsics.cs | 66 +++++++++++++++++-- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 18 +++++ 2 files changed, 79 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs index ec52b90ef..97f364a10 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs @@ -24,6 +24,9 @@ namespace SixLabors.ImageSharp false; #endif + /// + /// A variant of , which is faster on new .NET runtime. + /// // ReSharper disable once MemberHidesStaticFromOuterClass internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { @@ -37,7 +40,7 @@ namespace SixLabors.ImageSharp ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - var scale = new Vector(1f / 255f); + const float Scale = 1f / 255f; for (int i = 0; i < n; i++) { @@ -47,10 +50,10 @@ namespace SixLabors.ImageSharp Vector.Widen(s0, out Vector w0, out Vector w1); Vector.Widen(s1, out Vector w2, out Vector w3); - Vector f0 = Vector.ConvertToSingle(w0) * scale; - Vector f1 = Vector.ConvertToSingle(w1) * scale; - Vector f2 = Vector.ConvertToSingle(w2) * scale; - Vector f3 = Vector.ConvertToSingle(w3) * scale; + Vector f0 = Vector.ConvertToSingle(w0) * Scale; + Vector f1 = Vector.ConvertToSingle(w1) * Scale; + Vector f2 = Vector.ConvertToSingle(w2) * Scale; + Vector f3 = Vector.ConvertToSingle(w3) * Scale; ref Vector d = ref Unsafe.Add(ref destBase, i * 4); d = f0; @@ -59,6 +62,59 @@ namespace SixLabors.ImageSharp Unsafe.Add(ref d, 3) = f3; } } + + /// + /// A variant of , which is faster on new .NET runtime. + /// + // ReSharper disable once MemberHidesStaticFromOuterClass + internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) + { + Guard.IsTrue( + dest.Length % Vector.Count == 0, + nameof(source), + "dest.Length should be divisable by Vector.Count!"); + + int n = dest.Length / Vector.Count; + + ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + for (int i = 0; i < n; i++) + { + ref Vector s = ref Unsafe.Add(ref sourceBase, i * 4); + + Vector f0 = s; + f0 = Clamp(f0); + + Vector f1 = Unsafe.Add(ref s, 1); + f1 = Clamp(f1); + + Vector f2 = Unsafe.Add(ref s, 2); + f2 = Clamp(f2); + + Vector f3 = Unsafe.Add(ref s, 3); + f3 = Clamp(f3); + + Vector w0 = Vector.ConvertToUInt32(f0 * 255f); + Vector w1 = Vector.ConvertToUInt32(f1 * 255f); + Vector w2 = Vector.ConvertToUInt32(f2 * 255f); + Vector w3 = Vector.ConvertToUInt32(f3 * 255f); + + Vector u0 = Vector.Narrow(w0, w1); + Vector u1 = Vector.Narrow(w2, w3); + + Vector b = Vector.Narrow(u0, u1); + + Unsafe.Add(ref destBase, i) = b; + } + + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector Clamp(Vector x) + { + return Vector.Min(Vector.Max(x, Vector.Zero), Vector.One); + } } } } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 0488dd5e1..4b23ca30f 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -226,6 +226,24 @@ namespace SixLabors.ImageSharp.Tests.Common Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } + [Theory] + [InlineData(1, 0)] + [InlineData(2, 32)] + [InlineData(3, 128)] + public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int seed, int count) + { + float[] orig = new Random(seed).GenerateRandomRoundedFloatArray(count, -50, 444); + float[] normalized = orig.Select(f => f / 255f).ToArray(); + + byte[] dest = new byte[count]; + + SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(normalized, dest); + + byte[] expected = orig.Select(f => (byte)Clamp255(f)).ToArray(); + + Assert.Equal(expected, dest); + } + [Theory] [InlineData(0)] [InlineData(7)] From b8b411bb716664d840d08f96759cec91f4f471d4 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Tue, 16 Oct 2018 01:13:43 +0200 Subject: [PATCH 06/22] disappointing benchmark results --- .../SimdUtils.ExtendedIntrinsics.cs | 5 +- .../Color/Bulk/PackFromVector4.cs | 48 +++++++++++++++++-- .../Color/Bulk/ToVector4.cs | 6 +-- 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs index 97f364a10..90048ca9b 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs @@ -66,6 +66,10 @@ namespace SixLabors.ImageSharp /// /// A variant of , which is faster on new .NET runtime. /// + /// + /// It does NOT worth yet to utilize this method (2018 Oct). + /// See benchmark results for the "PackFromVector4_Rgba32" benchmark! + /// // ReSharper disable once MemberHidesStaticFromOuterClass internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) { @@ -107,7 +111,6 @@ namespace SixLabors.ImageSharp Unsafe.Add(ref destBase, i) = b; } - } [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index bdae7d065..4bf98e5ce 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -3,6 +3,7 @@ // ReSharper disable InconsistentNaming +using System; using System.Buffers; using System.Numerics; using System.Runtime.CompilerServices; @@ -19,9 +20,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public abstract class PackFromVector4 where TPixel : struct, IPixel { - private IMemoryOwner source; + protected IMemoryOwner source; - private IMemoryOwner destination; + protected IMemoryOwner destination; [Params( //64, @@ -42,7 +43,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk this.source.Dispose(); } - [Benchmark(Baseline = true)] + [Benchmark] public void PerElement() { ref Vector4 s = ref MemoryMarshal.GetReference(this.source.GetSpan()); @@ -54,7 +55,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - [Benchmark] + [Benchmark(Baseline = true)] public void CommonBulk() { new PixelOperations().PackFromVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); @@ -69,6 +70,45 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public class PackFromVector4_Rgba32 : PackFromVector4 { + //[Benchmark] + public void BulkConvertNormalizedFloatToByteClampOverflows() + { + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + + SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); + } + + [Benchmark] + public void ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows() + { + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + + SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); + } + // RESULTS: + // BenchmarkDotNet=v0.10.14, OS=Windows 10.0.17134 + // Intel Core i7-7700HQ CPU 2.80GHz (Kaby Lake), 1 CPU, 8 logical and 4 physical cores + // Frequency=2742187 Hz, Resolution=364.6724 ns, Timer=TSC + // .NET Core SDK=2.1.400-preview-009063 + // [Host] : .NET Core 2.1.1 (CoreCLR 4.6.26606.02, CoreFX 4.6.26606.05), 64bit RyuJIT + // Job-XIFINS : .NET Framework 4.7.1 (CLR 4.0.30319.42000), 64bit RyuJIT-v4.7.3190.0 + // Job-RTQZPN : .NET Core 2.1.1 (CoreCLR 4.6.26606.02, CoreFX 4.6.26606.05), 64bit RyuJIT + // + // LaunchCount=1 TargetCount=3 WarmupCount=3 + // + // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Allocated | + // ----------------------------------------------------------------- |-------- |------ |----------:|-----------:|----------:|-------:|---------:|----------:| + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3.755 us | 0.8959 us | 0.0506 us | 0.22 | 0.00 | 0 B | + // PerElement | Clr | 2048 | 17.387 us | 15.1569 us | 0.8564 us | 1.02 | 0.04 | 0 B | + // CommonBulk | Clr | 2048 | 17.121 us | 0.7634 us | 0.0431 us | 1.00 | 0.00 | 24 B | + // OptimizedBulk | Clr | 2048 | 4.018 us | 0.3858 us | 0.0218 us | 0.23 | 0.00 | 0 B | + // | | | | | | | | | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 22.232 us | 1.6154 us | 0.0913 us | 1.31 | 0.04 | 0 B | + // PerElement | Core | 2048 | 16.741 us | 2.9254 us | 0.1653 us | 0.98 | 0.03 | 0 B | + // CommonBulk | Core | 2048 | 17.022 us | 11.4894 us | 0.6492 us | 1.00 | 0.00 | 24 B | + // OptimizedBulk | Core | 2048 | 3.707 us | 0.1500 us | 0.0085 us | 0.22 | 0.01 | 0 B | } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 3ea256e85..39c1fbd47 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -79,7 +79,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk { } - //[Benchmark] + [Benchmark] public void BulkConvertByteToNormalizedFloat() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -89,12 +89,12 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } [Benchmark] - public void BulkConvertByteToNormalizedFloatFast() + public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - SimdUtils.BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(sBytes, dFloats); + SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } } From a4714207e03a0a8f92d0ee95751eb8693ef3a14e Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Wed, 17 Oct 2018 23:39:29 +0200 Subject: [PATCH 07/22] todo notes --- .../Common/Extensions/SimdUtils.ExtendedIntrinsics.cs | 1 + tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs index 90048ca9b..fba54b033 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs @@ -69,6 +69,7 @@ namespace SixLabors.ImageSharp /// /// It does NOT worth yet to utilize this method (2018 Oct). /// See benchmark results for the "PackFromVector4_Rgba32" benchmark! + /// TODO: Check again later! /// // ReSharper disable once MemberHidesStaticFromOuterClass internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index 4bf98e5ce..fb505ddcb 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -88,7 +88,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } + // TODO: Check again later! // RESULTS: + // // BenchmarkDotNet=v0.10.14, OS=Windows 10.0.17134 // Intel Core i7-7700HQ CPU 2.80GHz (Kaby Lake), 1 CPU, 8 logical and 4 physical cores // Frequency=2742187 Hz, Resolution=364.6724 ns, Timer=TSC From 0f4f8227907fd142b092a112737b5ace6c50c21a Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Fri, 19 Oct 2018 13:14:33 +0200 Subject: [PATCH 08/22] cleanup --- .../SimdUtils.ExtendedIntrinsics.cs | 32 +++-- .../{Extensions => Helpers}/SimdUtils.cs | 18 +-- .../PixelFormats/PixelOperations{TPixel}.cs | 56 +++++---- .../PixelFormats/Rgba32.PixelOperations.cs | 69 ++++++++--- .../Color/Bulk/ToVector4.cs | 112 ++++++++++++++++-- .../PixelFormats/PixelOperationsTests.cs | 2 +- 6 files changed, 220 insertions(+), 69 deletions(-) rename src/ImageSharp/Common/{Extensions => Helpers}/SimdUtils.ExtendedIntrinsics.cs (81%) rename src/ImageSharp/Common/{Extensions => Helpers}/SimdUtils.cs (96%) diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs similarity index 81% rename from src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs rename to src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index fba54b033..6def8938a 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -18,7 +18,7 @@ namespace SixLabors.ImageSharp { public static bool IsAvailable { get; } = #if NETCOREAPP2_1 -// TODO: Add a build target for .NET 4.7.2 +// TODO: Also available in .NET 4.7.2, we need to add a build target! true; #else false; @@ -31,14 +31,15 @@ namespace SixLabors.ImageSharp internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { Guard.IsTrue( - source.Length % Vector.Count == 0, + dest.Length % Vector.Count == 0, nameof(source), "dest.Length should be divisable by Vector.Count!"); - int n = source.Length / Vector.Count; + int n = dest.Length / Vector.Count; ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + ref Vector destBaseU = ref Unsafe.As, Vector>(ref destBase); const float Scale = 1f / 255f; @@ -50,16 +51,23 @@ namespace SixLabors.ImageSharp Vector.Widen(s0, out Vector w0, out Vector w1); Vector.Widen(s1, out Vector w2, out Vector w3); - Vector f0 = Vector.ConvertToSingle(w0) * Scale; - Vector f1 = Vector.ConvertToSingle(w1) * Scale; - Vector f2 = Vector.ConvertToSingle(w2) * Scale; - Vector f3 = Vector.ConvertToSingle(w3) * Scale; + ref Vector d = ref Unsafe.Add(ref destBaseU, i * 4); + d = w0; + Unsafe.Add(ref d, 1) = w1; + Unsafe.Add(ref d, 2) = w2; + Unsafe.Add(ref d, 3) = w3; + } + + n = dest.Length / Vector.Count; + + for (int i = 0; i < n; i++) + { + ref Vector df = ref Unsafe.Add(ref destBase, i); + ref Vector du = ref Unsafe.As, Vector>(ref df); - ref Vector d = ref Unsafe.Add(ref destBase, i * 4); - d = f0; - Unsafe.Add(ref d, 1) = f1; - Unsafe.Add(ref d, 2) = f2; - Unsafe.Add(ref d, 3) = f3; + Vector v = Vector.ConvertToSingle(du); + v *= Scale; + df = v; } } diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs similarity index 96% rename from src/ImageSharp/Common/Extensions/SimdUtils.cs rename to src/ImageSharp/Common/Helpers/SimdUtils.cs index 3630ede32..91aed8c79 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -2,13 +2,10 @@ // Licensed under the Apache License, Version 2.0. using System; -using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using SixLabors.ImageSharp.PixelFormats; - namespace SixLabors.ImageSharp { /// @@ -131,23 +128,26 @@ namespace SixLabors.ImageSharp ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); int n = dest.Length / 8; - Octet.OfUInt32 temp = default; for (int i = 0; i < n; i++) { - Octet.OfByte sVal = Unsafe.Add(ref sourceBase, i); + ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); + ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i); + d.LoadFrom(ref s); + } - // This call is the bottleneck now: - temp.LoadFrom(ref sVal); + for (int i = 0; i < n; i++) + { + ref Vector df = ref Unsafe.Add(ref destBaseAsFloat, i); - Vector vi = Unsafe.As>(ref temp); + var vi = Vector.AsVectorUInt32(df); vi &= mask; vi |= magicInt; var vf = Vector.AsVectorSingle(vi); vf = (vf - magicFloat) * bVec; - Unsafe.Add(ref destBaseAsFloat, i) = vf; + df = vf; } } diff --git a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs index b12a2bfa5..39c442fe0 100644 --- a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs +++ b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs @@ -29,17 +29,7 @@ namespace SixLabors.ImageSharp.PixelFormats /// The number of pixels to convert. internal virtual void PackFromVector4(ReadOnlySpan sourceVectors, Span destinationColors, int count) { - GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); - - ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors); - ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors); - - for (int i = 0; i < count; i++) - { - ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i); - ref TPixel dp = ref Unsafe.Add(ref destRef, i); - dp.PackFromVector4(sp); - } + PackFromVector4Common(sourceVectors, destinationColors, count); } /// @@ -50,17 +40,7 @@ namespace SixLabors.ImageSharp.PixelFormats /// The number of pixels to convert. internal virtual void ToVector4(ReadOnlySpan sourceColors, Span destinationVectors, int count) { - GuardSpans(sourceColors, nameof(sourceColors), destinationVectors, nameof(destinationVectors), count); - - ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors); - ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors); - - for (int i = 0; i < count; i++) - { - ref TPixel sp = ref Unsafe.Add(ref sourceRef, i); - ref Vector4 dp = ref Unsafe.Add(ref destRef, i); - dp = sp.ToVector4(); - } + ToVector4Common(sourceColors, destinationVectors, count); } /// @@ -126,5 +106,37 @@ namespace SixLabors.ImageSharp.PixelFormats Guard.MustBeSizedAtLeast(source, minLength, sourceParamName); Guard.MustBeSizedAtLeast(destination, minLength, destinationParamName); } + + [MethodImpl(InliningOptions.ShortMethod)] + internal static void PackFromVector4Common(ReadOnlySpan sourceVectors, Span destinationColors, int count) + { + GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); + + ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors); + ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors); + + for (int i = 0; i < count; i++) + { + ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i); + ref TPixel dp = ref Unsafe.Add(ref destRef, i); + dp.PackFromVector4(sp); + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + internal static void ToVector4Common(ReadOnlySpan sourceColors, Span destinationVectors, int count) + { + GuardSpans(sourceColors, nameof(sourceColors), destinationVectors, nameof(destinationVectors), count); + + ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors); + ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors); + + for (int i = 0; i < count; i++) + { + ref TPixel sp = ref Unsafe.Add(ref sourceRef, i); + ref Vector4 dp = ref Unsafe.Add(ref destRef, i); + dp = sp.ToVector4(); + } + } } } \ No newline at end of file diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs index 6745079da..0b96a599b 100644 --- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs @@ -27,28 +27,17 @@ namespace SixLabors.ImageSharp.PixelFormats if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) { // Doesn't worth to bother with SIMD: - base.ToVector4(sourceColors, destinationVectors, count); + ToVector4Common(sourceColors, destinationVectors, count); return; } - int remainder = count % 2; - int alignedCount = count - remainder; - - if (alignedCount > 0) + if (SimdUtils.ExtendedIntrinsics.IsAvailable) { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); - Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); - - SimdUtils.BulkConvertByteToNormalizedFloat( - rawSrc, - rawDest); + ConvertToVector4UsingExtendedIntrinsics(sourceColors, destinationVectors, count); } - - if (remainder > 0) + else { - // actually: remainder == 1 - int lastIdx = count - 1; - destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4(); + ConvertToVector4UsingStandardIntrinsics(sourceColors, destinationVectors, count); } } @@ -59,7 +48,7 @@ namespace SixLabors.ImageSharp.PixelFormats if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) { - base.PackFromVector4(sourceVectors, destinationColors, count); + PackFromVector4Common(sourceVectors, destinationColors, count); return; } @@ -109,6 +98,52 @@ namespace SixLabors.ImageSharp.PixelFormats sourcePixels.Slice(0, count).CopyTo(dest); } + + private static void ConvertToVector4UsingExtendedIntrinsics( + ReadOnlySpan sourceColors, + Span destinationVectors, + int count) + { + int remainder = count % 8; + int alignedCount = count - remainder; + + if (alignedCount > 0) + { + ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); + Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); + + SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); + } + + if (remainder > 0) + { + ToVector4Common(sourceColors.Slice(alignedCount), destinationVectors.Slice(alignedCount), remainder); + } + } + + private static void ConvertToVector4UsingStandardIntrinsics( + ReadOnlySpan sourceColors, + Span destinationVectors, + int count) + { + int remainder = count % 2; + int alignedCount = count - remainder; + + if (alignedCount > 0) + { + ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); + Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); + + SimdUtils.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); + } + + if (remainder > 0) + { + // actually: remainder == 1 + int lastIdx = count - 1; + destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4(); + } + } } } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 39c1fbd47..6afd3cf6b 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -6,6 +6,7 @@ using System.Buffers; using System; using System.Numerics; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using BenchmarkDotNet.Attributes; @@ -28,7 +29,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Params( //64, - 2048)] + //512 + 256 + )] public int Count { get; set; } [GlobalSetup] @@ -45,7 +48,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk this.destination.Dispose(); } - [Benchmark] + //[Benchmark] public void PerElement() { Span s = this.source.GetSpan(); @@ -53,32 +56,48 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk for (int i = 0; i < this.Count; i++) { - TPixel c = s[i]; - d[i] = c.ToVector4(); + d[i] = s[i].ToVector4(); } } - [Benchmark(Baseline = true)] + //[Benchmark] public void CommonBulk() { new PixelOperations().ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } - [Benchmark] + //[Benchmark] public void OptimizedBulk() { PixelOperations.Instance.ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } } - [CoreJob] - [ClrJob] + [RyuJitX64Job] + [DisassemblyDiagnoser(printAsm: true, printSource: true)] public class ToVector4_Rgba32 : ToVector4 { class Config : ManualConfig { } + [Benchmark(Baseline = true)] + public void FastScalarBulk() + { + ref Rgba32 sBase = ref this.source.GetSpan()[0]; + ref Vector4 dBase = ref this.destination.GetSpan()[0]; + + for (int i = 0; i < this.Count; i++) + { + ref Rgba32 s = ref Unsafe.Add(ref sBase, i); + ref Vector4 d = ref Unsafe.Add(ref dBase, i); + d.X = s.R; + d.Y = s.G; + d.Z = s.B; + d.W = s.A; + } + } + [Benchmark] public void BulkConvertByteToNormalizedFloat() { @@ -97,5 +116,82 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } + //[Benchmark] + public void Original() + { + ToVector4SimdAligned(this.source.GetSpan(), this.destination.GetSpan(), this.Count); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ToVector4SimdAligned(ReadOnlySpan sourceColors, Span destVectors, int count) + { + if (!Vector.IsHardwareAccelerated) + { + throw new InvalidOperationException( + "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!"); + } + + DebugGuard.IsTrue( + count % Vector.Count == 0, + nameof(count), + "Argument 'count' should divisible by Vector.Count!"); + + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f + var mask = new Vector(255); + + int unpackedRawCount = count * 4; + + ref uint sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(sourceColors)); + ref UnpackedRGBA destBaseAsUnpacked = ref Unsafe.As(ref MemoryMarshal.GetReference(destVectors)); + ref Vector destBaseAsUInt = ref Unsafe.As>(ref destBaseAsUnpacked); + ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsUnpacked); + + for (int i = 0; i < count; i++) + { + uint sVal = Unsafe.Add(ref sourceBase, i); + ref UnpackedRGBA dst = ref Unsafe.Add(ref destBaseAsUnpacked, i); + + // This call is the bottleneck now: + dst.Load(sVal); + } + + int numOfVectors = unpackedRawCount / Vector.Count; + + for (int i = 0; i < numOfVectors; i++) + { + Vector vi = Unsafe.Add(ref destBaseAsUInt, i); + + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; + + Unsafe.Add(ref destBaseAsFloat, i) = vf; + } + } + + [StructLayout(LayoutKind.Sequential)] + private struct UnpackedRGBA + { + private uint r; + + private uint g; + + private uint b; + + private uint a; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Load(uint p) + { + this.r = p; + this.g = p >> 8; + this.b = p >> 16; + this.a = p >> 24; + } + } } } \ No newline at end of file diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs index 535952e05..abf764881 100644 --- a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs +++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs @@ -90,7 +90,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats { } - public static TheoryData ArraySizesData => new TheoryData { 0, 1, 2, 7, 16, 1111 }; + public static TheoryData ArraySizesData => new TheoryData { 0, 1, 2, 7, 16, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 1111 }; private static PixelOperations Operations => PixelOperations.Instance; From 0e06eb635557d3e021d7afee3ee41c1f28a5575d Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Fri, 19 Oct 2018 13:38:03 +0200 Subject: [PATCH 09/22] benchmark conversion steps separately --- .../General/Vectorization/UInt32ToSingle.cs | 66 +++++++++++++++++++ .../Vectorization/WidenBytesToUInt32.cs | 61 +++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs create mode 100644 tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs new file mode 100644 index 000000000..4a4b939b6 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs @@ -0,0 +1,66 @@ +using System.Numerics; +using System.Runtime.CompilerServices; + +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization +{ + public class UInt32ToSingle + { + private float[] data; + + private const int Count = 64; + + [GlobalSetup] + public void Setup() + { + this.data = new float[Count]; + } + + [Benchmark(Baseline = true)] + public void MagicMethod() + { + ref Vector b = ref Unsafe.As>(ref this.data[0]); + + int n = Count / Vector.Count; + + Vector magick = new Vector(32768.0f); + Vector scale = new Vector(255f) / new Vector(256f); + + for (int i = 0; i < n; i++) + { + // union { float f; uint32_t i; } u; + // u.f = 32768.0f + x * (255.0f / 256.0f); + // return (uint8_t)u.i; + + ref Vector d = ref Unsafe.Add(ref b, i); + Vector x = d; + //x = Vector.Max(x, Vector.Zero); + //x = Vector.Min(x, Vector.One); + + x = (x * scale) + magick; + d = x; + } + } + + [Benchmark] + public void StandardSimd() + { + int n = Count / Vector.Count; + + ref Vector b = ref Unsafe.As>(ref this.data[0]); + + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + ref Vector df = ref Unsafe.Add(ref b, i); + Vector du = Unsafe.As, Vector>(ref df); + + Vector v = Vector.ConvertToSingle(du); + v *= scale; + df = v; + } + } + } +} \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs new file mode 100644 index 000000000..f71f6ec1b --- /dev/null +++ b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs @@ -0,0 +1,61 @@ +using System.Numerics; +using System.Runtime.CompilerServices; + +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization +{ + public class WidenBytesToUInt32 + { + private byte[] source; + + private uint[] dest; + + private const int Count = 64; + + [GlobalSetup] + public void Setup() + { + this.source = new byte[Count]; + this.dest = new uint[Count]; + } + + [Benchmark(Baseline = true)] + public void Standard() + { + const int N = Count / 8; + + ref SimdUtils.Octet.OfByte sBase = ref Unsafe.As(ref this.source[0]); + ref SimdUtils.Octet.OfUInt32 dBase = ref Unsafe.As(ref this.dest[0]); + + for (int i = 0; i < N; i++) + { + Unsafe.Add(ref dBase, i).LoadFrom(ref Unsafe.Add(ref sBase, i)); + } + } + + [Benchmark] + public void Simd() + { + int n = Count / Vector.Count; + + ref Vector sBase = ref Unsafe.As>(ref this.source[0]); + ref Vector dBase = ref Unsafe.As>(ref this.dest[0]); + + for (int i = 0; i < n; i++) + { + Vector b = Unsafe.Add(ref sBase, i); + + Vector.Widen(b, out Vector s0, out Vector s1); + Vector.Widen(s0, out Vector w0, out Vector w1); + Vector.Widen(s1, out Vector w2, out Vector w3); + + ref Vector d = ref Unsafe.Add(ref dBase, i * 4); + d = w0; + Unsafe.Add(ref d, 1) = w1; + Unsafe.Add(ref d, 2) = w2; + Unsafe.Add(ref d, 3) = w3; + } + } + } +} \ No newline at end of file From 0f538ff1953b637d01f6c913b068cad763cd9152 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sat, 20 Oct 2018 14:42:23 +0200 Subject: [PATCH 10/22] fixed benchmarks and optimized implementation --- .../Helpers/SimdUtils.ExtendedIntrinsics.cs | 61 +++--- .../Color/Bulk/PackFromVector4.cs | 67 +++--- .../Color/Bulk/ToVector4.cs | 197 +++++++++++++++--- .../General/Vectorization/UInt32ToSingle.cs | 52 +++-- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 6 +- 5 files changed, 279 insertions(+), 104 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index 6def8938a..3131f1873 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -39,9 +39,8 @@ namespace SixLabors.ImageSharp ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - ref Vector destBaseU = ref Unsafe.As, Vector>(ref destBase); - const float Scale = 1f / 255f; + var scale = new Vector(1f / 255f); for (int i = 0; i < n; i++) { @@ -51,26 +50,28 @@ namespace SixLabors.ImageSharp Vector.Widen(s0, out Vector w0, out Vector w1); Vector.Widen(s1, out Vector w2, out Vector w3); - ref Vector d = ref Unsafe.Add(ref destBaseU, i * 4); - d = w0; - Unsafe.Add(ref d, 1) = w1; - Unsafe.Add(ref d, 2) = w2; - Unsafe.Add(ref d, 3) = w3; - } - - n = dest.Length / Vector.Count; + Vector f0 = ConvertToSingle(w0, scale); + Vector f1 = ConvertToSingle(w1, scale); + Vector f2 = ConvertToSingle(w2, scale); + Vector f3 = ConvertToSingle(w3, scale); - for (int i = 0; i < n; i++) - { - ref Vector df = ref Unsafe.Add(ref destBase, i); - ref Vector du = ref Unsafe.As, Vector>(ref df); - - Vector v = Vector.ConvertToSingle(du); - v *= Scale; - df = v; + ref Vector d = ref Unsafe.Add(ref destBase, i * 4); + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector ConvertToSingle(Vector u, Vector scale) + { + Vector vi = Vector.AsVectorInt32(u); + Vector v = Vector.ConvertToSingle(vi); + v *= scale; + return v; + } + /// /// A variant of , which is faster on new .NET runtime. /// @@ -92,26 +93,21 @@ namespace SixLabors.ImageSharp ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + Vector scale = new Vector(255); + for (int i = 0; i < n; i++) { ref Vector s = ref Unsafe.Add(ref sourceBase, i * 4); Vector f0 = s; - f0 = Clamp(f0); - Vector f1 = Unsafe.Add(ref s, 1); - f1 = Clamp(f1); - Vector f2 = Unsafe.Add(ref s, 2); - f2 = Clamp(f2); - Vector f3 = Unsafe.Add(ref s, 3); - f3 = Clamp(f3); - Vector w0 = Vector.ConvertToUInt32(f0 * 255f); - Vector w1 = Vector.ConvertToUInt32(f1 * 255f); - Vector w2 = Vector.ConvertToUInt32(f2 * 255f); - Vector w3 = Vector.ConvertToUInt32(f3 * 255f); + Vector w0 = ConvertToUInt32(f0, scale); + Vector w1 = ConvertToUInt32(f1, scale); + Vector w2 = ConvertToUInt32(f2, scale); + Vector w3 = ConvertToUInt32(f3, scale); Vector u0 = Vector.Narrow(w0, w1); Vector u1 = Vector.Narrow(w2, w3); @@ -123,9 +119,12 @@ namespace SixLabors.ImageSharp } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector Clamp(Vector x) + private static Vector ConvertToUInt32(Vector vf, Vector scale) { - return Vector.Min(Vector.Max(x, Vector.Zero), Vector.One); + vf = Vector.Min(Vector.Max(vf, Vector.Zero), Vector.One); + vf *= scale; + Vector vi = Vector.ConvertToInt32(vf); + return Vector.AsVectorUInt32(vi); } } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index fb505ddcb..1153d8f40 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -26,7 +26,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Params( //64, - 2048)] + 2048 + )] public int Count { get; set; } [GlobalSetup] @@ -43,7 +44,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk this.source.Dispose(); } - [Benchmark] + //[Benchmark] public void PerElement() { ref Vector4 s = ref MemoryMarshal.GetReference(this.source.GetSpan()); @@ -55,14 +56,14 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - [Benchmark(Baseline = true)] - public void CommonBulk() + [Benchmark] + public void PixelOperations_Base() { new PixelOperations().PackFromVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } [Benchmark] - public void OptimizedBulk() + public void PixelOperations_Specialized() { PixelOperations.Instance.PackFromVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } @@ -70,7 +71,30 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public class PackFromVector4_Rgba32 : PackFromVector4 { - //[Benchmark] + [Benchmark] + public void FastDefault() + { + ref Vector4 sBase = ref this.source.GetSpan()[0]; + ref Rgba32 dBase = ref this.destination.GetSpan()[0]; + + Vector4 maxBytes = new Vector4(255); + Vector4 half = new Vector4(0.5f); + + for (int i = 0; i < this.Count; i++) + { + Vector4 v = Unsafe.Add(ref sBase, i); + v *= maxBytes; + v += half; + v = Vector4.Clamp(v, Vector4.Zero, maxBytes); + ref Rgba32 d = ref Unsafe.Add(ref dBase, i); + d.R = (byte)v.X; + d.G = (byte)v.Y; + d.B = (byte)v.Z; + d.A = (byte)v.W; + } + } + + [Benchmark(Baseline = true)] public void BulkConvertNormalizedFloatToByteClampOverflows() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -88,29 +112,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } - // TODO: Check again later! // RESULTS: - // - // BenchmarkDotNet=v0.10.14, OS=Windows 10.0.17134 - // Intel Core i7-7700HQ CPU 2.80GHz (Kaby Lake), 1 CPU, 8 logical and 4 physical cores - // Frequency=2742187 Hz, Resolution=364.6724 ns, Timer=TSC - // .NET Core SDK=2.1.400-preview-009063 - // [Host] : .NET Core 2.1.1 (CoreCLR 4.6.26606.02, CoreFX 4.6.26606.05), 64bit RyuJIT - // Job-XIFINS : .NET Framework 4.7.1 (CLR 4.0.30319.42000), 64bit RyuJIT-v4.7.3190.0 - // Job-RTQZPN : .NET Core 2.1.1 (CoreCLR 4.6.26606.02, CoreFX 4.6.26606.05), 64bit RyuJIT - // - // LaunchCount=1 TargetCount=3 WarmupCount=3 + // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Allocated | + // ----------------------------------------------------------------- |-------- |------ |----------:|----------:|----------:|-------:|---------:|----------:| + // FastDefault | Clr | 2048 | 15.989 us | 6.1384 us | 0.3468 us | 4.07 | 0.08 | 0 B | + // BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3.931 us | 0.6264 us | 0.0354 us | 1.00 | 0.00 | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 2.100 us | 0.4717 us | 0.0267 us | 0.53 | 0.01 | 0 B | // - // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Allocated | - // ----------------------------------------------------------------- |-------- |------ |----------:|-----------:|----------:|-------:|---------:|----------:| - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3.755 us | 0.8959 us | 0.0506 us | 0.22 | 0.00 | 0 B | - // PerElement | Clr | 2048 | 17.387 us | 15.1569 us | 0.8564 us | 1.02 | 0.04 | 0 B | - // CommonBulk | Clr | 2048 | 17.121 us | 0.7634 us | 0.0431 us | 1.00 | 0.00 | 24 B | - // OptimizedBulk | Clr | 2048 | 4.018 us | 0.3858 us | 0.0218 us | 0.23 | 0.00 | 0 B | - // | | | | | | | | | - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 22.232 us | 1.6154 us | 0.0913 us | 1.31 | 0.04 | 0 B | - // PerElement | Core | 2048 | 16.741 us | 2.9254 us | 0.1653 us | 0.98 | 0.03 | 0 B | - // CommonBulk | Core | 2048 | 17.022 us | 11.4894 us | 0.6492 us | 1.00 | 0.00 | 24 B | - // OptimizedBulk | Core | 2048 | 3.707 us | 0.1500 us | 0.0085 us | 0.22 | 0.01 | 0 B | + // | | | | | | | | | + // FastDefault | Core | 2048 | 14.693 us | 0.5131 us | 0.0290 us | 3.76 | 0.03 | 0 B | + // BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 3.913 us | 0.5661 us | 0.0320 us | 1.00 | 0.00 | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 1.966 us | 0.4056 us | 0.0229 us | 0.50 | 0.01 | 0 B | } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 6afd3cf6b..d699d168b 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -29,8 +29,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Params( //64, - //512 - 256 + //256, + //512, + 2048 )] public int Count { get; set; } @@ -60,70 +61,214 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - //[Benchmark] - public void CommonBulk() + [Benchmark] + public void PixelOperations_Base() { new PixelOperations().ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } - //[Benchmark] - public void OptimizedBulk() + [Benchmark] + public void PixelOperations_Specialized() { PixelOperations.Instance.ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } } - [RyuJitX64Job] - [DisassemblyDiagnoser(printAsm: true, printSource: true)] + [Config(typeof(Config.ShortClr))] public class ToVector4_Rgba32 : ToVector4 { - class Config : ManualConfig - { - } - - [Benchmark(Baseline = true)] - public void FastScalarBulk() + [Benchmark] + public void BasicBulk() { ref Rgba32 sBase = ref this.source.GetSpan()[0]; ref Vector4 dBase = ref this.destination.GetSpan()[0]; + Vector4 scale = new Vector4(1f / 255f); + + Vector4 v = default; + for (int i = 0; i < this.Count; i++) { ref Rgba32 s = ref Unsafe.Add(ref sBase, i); - ref Vector4 d = ref Unsafe.Add(ref dBase, i); - d.X = s.R; - d.Y = s.G; - d.Z = s.B; - d.W = s.A; + v.X = s.R; + v.Y = s.G; + v.Z = s.B; + v.W = s.A; + v *= scale; + Unsafe.Add(ref dBase, i) = v; + } + } + + [Benchmark(Baseline = true)] + public void BulkConvertByteToNormalizedFloat_2Loops() + { + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f + var mask = new Vector(255); + + ref SimdUtils.Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); + ref SimdUtils.Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dFloats)); + + ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); + + int n = dFloats.Length / 8; + + for (int i = 0; i < n; i++) + { + ref SimdUtils.Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); + ref SimdUtils.Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i); + d.LoadFrom(ref s); + } + + for (int i = 0; i < n; i++) + { + ref Vector df = ref Unsafe.Add(ref destBaseAsFloat, i); + + var vi = Vector.AsVectorUInt32(df); + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; + + df = vf; + } + } + + //[Benchmark] + public void BulkConvertByteToNormalizedFloat_ConvertInSameLoop() + { + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f + var mask = new Vector(255); + + ref SimdUtils.Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); + ref SimdUtils.Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dFloats)); + + ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); + + int n = dFloats.Length / 8; + + var temp = default(SimdUtils.Octet.OfUInt32); + ref Vector tempRef = ref Unsafe.As>(ref temp); + + for (int i = 0; i < n; i++) + { + ref SimdUtils.Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); + temp.LoadFrom(ref s); + + Vector vi = tempRef; + + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; + + Unsafe.Add(ref destBaseAsFloat, i) = vf; } } [Benchmark] - public void BulkConvertByteToNormalizedFloat() + public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - SimdUtils.BulkConvertByteToNormalizedFloat(sBytes, dFloats); + int n = dFloats.Length / Vector.Count; + + ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); + ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dFloats)); + ref Vector destBaseU = ref Unsafe.As, Vector>(ref destBase); + + for (int i = 0; i < n; i++) + { + Vector b = Unsafe.Add(ref sourceBase, i); + + Vector.Widen(b, out Vector s0, out Vector s1); + Vector.Widen(s0, out Vector w0, out Vector w1); + Vector.Widen(s1, out Vector w2, out Vector w3); + + ref Vector d = ref Unsafe.Add(ref destBaseU, i * 4); + d = w0; + Unsafe.Add(ref d, 1) = w1; + Unsafe.Add(ref d, 2) = w2; + Unsafe.Add(ref d, 3) = w3; + } + + n = dFloats.Length / Vector.Count; + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + ref Vector dRef = ref Unsafe.Add(ref destBase, i); + + Vector du = Vector.AsVectorInt32(dRef); + Vector v = Vector.ConvertToSingle(du); + v *= scale; + + dRef = v; + } } [Benchmark] - public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat() + public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats); + int n = dFloats.Length / Vector.Count; + + ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); + ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dFloats)); + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + Vector b = Unsafe.Add(ref sourceBase, i); + + Vector.Widen(b, out Vector s0, out Vector s1); + Vector.Widen(s0, out Vector w0, out Vector w1); + Vector.Widen(s1, out Vector w2, out Vector w3); + + Vector f0 = ConvertToNormalizedSingle(w0, scale); + Vector f1 = ConvertToNormalizedSingle(w1, scale); + Vector f2 = ConvertToNormalizedSingle(w2, scale); + Vector f3 = ConvertToNormalizedSingle(w3, scale); + + ref Vector d = ref Unsafe.Add(ref destBase, i * 4); + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector ConvertToNormalizedSingle(Vector u, Vector scale) + { + Vector vi = Vector.AsVectorInt32(u); + Vector v = Vector.ConvertToSingle(vi); + v *= scale; + return v; } //[Benchmark] - public void Original() + public void OldImplementation() { - ToVector4SimdAligned(this.source.GetSpan(), this.destination.GetSpan(), this.Count); + ToVector4OldImplementation(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void ToVector4SimdAligned(ReadOnlySpan sourceColors, Span destVectors, int count) + private static void ToVector4OldImplementation(ReadOnlySpan sourceColors, Span destVectors, int count) { if (!Vector.IsHardwareAccelerated) { diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs index 4a4b939b6..be19e719a 100644 --- a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs +++ b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs @@ -9,7 +9,7 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization { private float[] data; - private const int Count = 64; + private const int Count = 32; [GlobalSetup] public void Setup() @@ -24,8 +24,10 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization int n = Count / Vector.Count; - Vector magick = new Vector(32768.0f); - Vector scale = new Vector(255f) / new Vector(256f); + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f + var mask = new Vector(255); for (int i = 0; i < n; i++) { @@ -33,13 +35,16 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization // u.f = 32768.0f + x * (255.0f / 256.0f); // return (uint8_t)u.i; - ref Vector d = ref Unsafe.Add(ref b, i); - Vector x = d; - //x = Vector.Max(x, Vector.Zero); - //x = Vector.Min(x, Vector.One); + ref Vector df = ref Unsafe.Add(ref b, i); + + var vi = Vector.AsVectorUInt32(df); + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; - x = (x * scale) + magick; - d = x; + df = vf; } } @@ -48,18 +53,37 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization { int n = Count / Vector.Count; - ref Vector b = ref Unsafe.As>(ref this.data[0]); + ref Vector bf = ref Unsafe.As>(ref this.data[0]); + ref Vector bu = ref Unsafe.As, Vector>(ref bf); var scale = new Vector(1f / 255f); for (int i = 0; i < n; i++) { - ref Vector df = ref Unsafe.Add(ref b, i); - Vector du = Unsafe.As, Vector>(ref df); + Vector u = Unsafe.Add(ref bu, i); + Vector v = Vector.ConvertToSingle(u); + v *= scale; + Unsafe.Add(ref bf, i) = v; + } + } - Vector v = Vector.ConvertToSingle(du); + // This code is not correct at all, it's just here as reference + [Benchmark] + public void StandardSimdFromInt() + { + int n = Count / Vector.Count; + + ref Vector bf = ref Unsafe.As>(ref this.data[0]); + ref Vector bu = ref Unsafe.As, Vector>(ref bf); + + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + Vector u = Unsafe.Add(ref bu, i); + Vector v = Vector.ConvertToSingle(u); v *= scale; - df = v; + Unsafe.Add(ref bf, i) = v; } } } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 4b23ca30f..7ed18ef86 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -212,15 +212,11 @@ namespace SixLabors.ImageSharp.Tests.Common [InlineData(3, 128)] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count) { - if (!Vector.IsHardwareAccelerated) - { - return; - } - byte[] source = new Random(seed).GenerateRandomByteArray(count); float[] result = new float[count]; float[] expected = source.Select(b => (float)b / 255f).ToArray(); + SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result); Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); From 664d838291b5fc0aa3b975c7749023c69ebc8258 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sat, 20 Oct 2018 19:47:08 +0200 Subject: [PATCH 11/22] fix accuracy issues --- .../Helpers/SimdUtils.ExtendedIntrinsics.cs | 20 +++--- .../PixelFormats/Rgba32.PixelOperations.cs | 57 +++++++++++---- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 70 +++++++++---------- .../TestUtilities/TestDataGenerator.cs | 9 +-- 4 files changed, 91 insertions(+), 65 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index 3131f1873..ec91e5098 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -85,7 +85,7 @@ namespace SixLabors.ImageSharp { Guard.IsTrue( dest.Length % Vector.Count == 0, - nameof(source), + nameof(dest), "dest.Length should be divisable by Vector.Count!"); int n = dest.Length / Vector.Count; @@ -93,8 +93,6 @@ namespace SixLabors.ImageSharp ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - Vector scale = new Vector(255); - for (int i = 0; i < n; i++) { ref Vector s = ref Unsafe.Add(ref sourceBase, i * 4); @@ -104,10 +102,10 @@ namespace SixLabors.ImageSharp Vector f2 = Unsafe.Add(ref s, 2); Vector f3 = Unsafe.Add(ref s, 3); - Vector w0 = ConvertToUInt32(f0, scale); - Vector w1 = ConvertToUInt32(f1, scale); - Vector w2 = ConvertToUInt32(f2, scale); - Vector w3 = ConvertToUInt32(f3, scale); + Vector w0 = ConvertToUInt32(f0); + Vector w1 = ConvertToUInt32(f1); + Vector w2 = ConvertToUInt32(f2); + Vector w3 = ConvertToUInt32(f3); Vector u0 = Vector.Narrow(w0, w1); Vector u1 = Vector.Narrow(w2, w3); @@ -119,10 +117,12 @@ namespace SixLabors.ImageSharp } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector ConvertToUInt32(Vector vf, Vector scale) + private static Vector ConvertToUInt32(Vector vf) { - vf = Vector.Min(Vector.Max(vf, Vector.Zero), Vector.One); - vf *= scale; + Vector maxBytes = new Vector(255f); + vf *= maxBytes; + vf += new Vector(0.5f); + vf = Vector.Min(Vector.Max(vf, Vector.Zero), maxBytes); Vector vi = Vector.ConvertToInt32(vf); return Vector.AsVectorUInt32(vi); } diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs index 0b96a599b..bfef60c60 100644 --- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs @@ -52,22 +52,13 @@ namespace SixLabors.ImageSharp.PixelFormats return; } - int remainder = count % 2; - int alignedCount = count - remainder; - - if (alignedCount > 0) + if (SimdUtils.ExtendedIntrinsics.IsAvailable) { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount)); - Span rawDest = MemoryMarshal.Cast(destinationColors); - - SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); + ConvertFromVector4ExtendedIntrinsics(sourceVectors, destinationColors, count); } - - if (remainder > 0) + else { - // actually: remainder == 1 - int lastIdx = count - 1; - destinationColors[lastIdx].PackFromVector4(sourceVectors[lastIdx]); + ConvertFromVector4StandardIntrinsics(sourceVectors, destinationColors, count); } } @@ -144,6 +135,46 @@ namespace SixLabors.ImageSharp.PixelFormats destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4(); } } + + private static void ConvertFromVector4ExtendedIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count) + { + int remainder = count % 8; + int alignedCount = count - remainder; + + if (alignedCount > 0) + { + ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors); + Span rawDest = MemoryMarshal.Cast(destinationColors.Slice(0, alignedCount)); + + SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); + } + + if (remainder > 0) + { + PackFromVector4Common(sourceVectors.Slice(alignedCount), destinationColors.Slice(alignedCount), remainder); + } + } + + private static void ConvertFromVector4StandardIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count) + { + int remainder = count % 2; + int alignedCount = count - remainder; + + if (alignedCount > 0) + { + ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount)); + Span rawDest = MemoryMarshal.Cast(destinationColors); + + SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); + } + + if (remainder > 0) + { + // actually: remainder == 1 + int lastIdx = count - 1; + destinationColors[lastIdx].PackFromVector4(sourceVectors[lastIdx]); + } + } } } } \ No newline at end of file diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 7ed18ef86..4e1717bda 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -160,31 +160,6 @@ namespace SixLabors.ImageSharp.Tests.Common Assert.Equal(expected, dest); } - private static float Clamp255(float x) => Math.Min(255f, Math.Max(0f, x)); - - [Theory] - [InlineData(1, 0)] - [InlineData(1, 8)] - [InlineData(2, 16)] - [InlineData(3, 128)] - public void BulkConvertNormalizedFloatToByteClampOverflows(int seed, int count) - { - if (this.SkipOnNonAvx2()) - { - return; - } - - float[] orig = new Random(seed).GenerateRandomRoundedFloatArray(count, -50, 444); - float[] normalized = orig.Select(f => f / 255f).ToArray(); - - byte[] dest = new byte[count]; - - SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(normalized, dest); - - byte[] expected = orig.Select(f => (byte)Clamp255(f)).ToArray(); - - Assert.Equal(expected, dest); - } [Theory] [InlineData(1, 0)] @@ -222,23 +197,44 @@ namespace SixLabors.ImageSharp.Tests.Common Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } + + public static readonly TheoryData BulkConvertNormalizedFloatToByteClampOverflows_Data = + new TheoryData + { + 0, 64, 1024 + }; + [Theory] - [InlineData(1, 0)] - [InlineData(2, 32)] - [InlineData(3, 128)] - public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int seed, int count) + [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))] + public void BulkConvertNormalizedFloatToByteClampOverflows(int count) { - float[] orig = new Random(seed).GenerateRandomRoundedFloatArray(count, -50, 444); - float[] normalized = orig.Select(f => f / 255f).ToArray(); + if (this.SkipOnNonAvx2()) + { + return; + } - byte[] dest = new byte[count]; + float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f); + byte[] expected = source.Select(NormalizedFloatToByte).ToArray(); + byte[] actual = new byte[count]; - SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(normalized, dest); + SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(source, actual); - byte[] expected = orig.Select(f => (byte)Clamp255(f)).ToArray(); + Assert.Equal(expected, actual); + } - Assert.Equal(expected, dest); + [Theory] + [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))] + public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) + { + float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f); + byte[] expected = source.Select(NormalizedFloatToByte).ToArray(); + byte[] actual = new byte[count]; + + SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(source, actual); + + Assert.Equal(expected, actual); } + private static byte NormalizedFloatToByte(float f) => (byte)Math.Min(255f, Math.Max(0f, f * 255f + 0.5f)); [Theory] [InlineData(0)] @@ -265,7 +261,7 @@ namespace SixLabors.ImageSharp.Tests.Common float[] source = { 0, 7, 42, 255, 0.5f, 1.1f, 2.6f, 16f }; - var expected = source.Select(f => (byte)Math.Round(f)).ToArray(); + byte[] expected = source.Select(f => (byte)Math.Round(f)).ToArray(); source = source.Select(f => f / 255f).ToArray(); @@ -299,8 +295,6 @@ namespace SixLabors.ImageSharp.Tests.Common iiRef = x; - //Tuple8.OfUInt32 ii = Unsafe.As, Tuple8.OfUInt32>(ref x); - ref Tuple8.OfByte d = ref MemoryMarshal.Cast(dest)[0]; d.LoadFrom(ref ii); diff --git a/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs b/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs index 6f3b18e1f..912b86e34 100644 --- a/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs +++ b/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs @@ -33,19 +33,20 @@ namespace SixLabors.ImageSharp.Tests return values; } - public static float[] GenerateRandomRoundedFloatArray(this Random rnd, int length, int minVal, int maxValExclusive) + public static float[] GenerateRandomRoundedFloatArray(this Random rnd, int length, float minVal, float maxVal) { float[] values = new float[length]; for (int i = 0; i < length; i++) { - int val = rnd.Next(minVal, maxValExclusive); - values[i] = (float)val; + values[i] = (float) Math.Round(rnd.GetRandomFloat(minVal, maxVal)); } return values; } + + public static byte[] GenerateRandomByteArray(this Random rnd, int length) { byte[] values = new byte[length]; @@ -53,7 +54,7 @@ namespace SixLabors.ImageSharp.Tests return values; } - private static float GetRandomFloat(Random rnd, float minVal, float maxVal) + private static float GetRandomFloat(this Random rnd, float minVal, float maxVal) { return (float)rnd.NextDouble() * (maxVal - minVal) + minVal; } From 10afe6572e598274e89ba8851b87732eb2f6d6fc Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sat, 20 Oct 2018 20:05:20 +0200 Subject: [PATCH 12/22] cleanup benchmarks --- .../Color/Bulk/ToVector4.cs | 96 ++----------------- 1 file changed, 10 insertions(+), 86 deletions(-) diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index d699d168b..726e214a9 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -100,84 +100,24 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } [Benchmark(Baseline = true)] - public void BulkConvertByteToNormalizedFloat_2Loops() + public void BulkConvertByteToNormalizedFloat() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - var bVec = new Vector(256.0f / 255.0f); - var magicFloat = new Vector(32768.0f); - var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f - var mask = new Vector(255); - - ref SimdUtils.Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); - ref SimdUtils.Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dFloats)); - - ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); - - int n = dFloats.Length / 8; - - for (int i = 0; i < n; i++) - { - ref SimdUtils.Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); - ref SimdUtils.Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i); - d.LoadFrom(ref s); - } - - for (int i = 0; i < n; i++) - { - ref Vector df = ref Unsafe.Add(ref destBaseAsFloat, i); - - var vi = Vector.AsVectorUInt32(df); - vi &= mask; - vi |= magicInt; - - var vf = Vector.AsVectorSingle(vi); - vf = (vf - magicFloat) * bVec; - - df = vf; - } + SimdUtils.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } - //[Benchmark] - public void BulkConvertByteToNormalizedFloat_ConvertInSameLoop() + [Benchmark] + public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - var bVec = new Vector(256.0f / 255.0f); - var magicFloat = new Vector(32768.0f); - var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f - var mask = new Vector(255); - - ref SimdUtils.Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); - ref SimdUtils.Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dFloats)); - - ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); - - int n = dFloats.Length / 8; - - var temp = default(SimdUtils.Octet.OfUInt32); - ref Vector tempRef = ref Unsafe.As>(ref temp); - - for (int i = 0; i < n; i++) - { - ref SimdUtils.Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); - temp.LoadFrom(ref s); - - Vector vi = tempRef; - - vi &= mask; - vi |= magicInt; - - var vf = Vector.AsVectorSingle(vi); - vf = (vf - magicFloat) * bVec; - - Unsafe.Add(ref destBaseAsFloat, i) = vf; - } + SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } - [Benchmark] + //[Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -219,7 +159,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - [Benchmark] + //[Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -264,23 +204,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk //[Benchmark] public void OldImplementation() { - ToVector4OldImplementation(this.source.GetSpan(), this.destination.GetSpan(), this.Count); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void ToVector4OldImplementation(ReadOnlySpan sourceColors, Span destVectors, int count) - { - if (!Vector.IsHardwareAccelerated) - { - throw new InvalidOperationException( - "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!"); - } - - DebugGuard.IsTrue( - count % Vector.Count == 0, - nameof(count), - "Argument 'count' should divisible by Vector.Count!"); - + int count = this.Count; var bVec = new Vector(256.0f / 255.0f); var magicFloat = new Vector(32768.0f); var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f @@ -288,8 +212,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk int unpackedRawCount = count * 4; - ref uint sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(sourceColors)); - ref UnpackedRGBA destBaseAsUnpacked = ref Unsafe.As(ref MemoryMarshal.GetReference(destVectors)); + ref uint sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference((ReadOnlySpan)this.source.GetSpan())); + ref UnpackedRGBA destBaseAsUnpacked = ref Unsafe.As(ref MemoryMarshal.GetReference(this.destination.GetSpan())); ref Vector destBaseAsUInt = ref Unsafe.As>(ref destBaseAsUnpacked); ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsUnpacked); From 17f6dcc877f720848ca9d21a443c341c22bfaf87 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sat, 20 Oct 2018 22:49:31 +0200 Subject: [PATCH 13/22] Bulk conversion of arbitrary-sized Span-s of scalars --- .../Helpers/SimdUtils.BasicIntrinsics256.cs | 212 +++++++++++++++ .../Helpers/SimdUtils.ExtendedIntrinsics.cs | 90 +++++-- src/ImageSharp/Common/Helpers/SimdUtils.cs | 255 +++--------------- src/ImageSharp/Common/Tuples/Octet.cs | 100 +++++++ src/ImageSharp/Common/Tuples/Vector4Pair.cs | 2 +- .../JpegColorConverter.FromYCbCrSimd.cs | 2 +- .../JpegColorConverter.FromYCbCrSimdAvx2.cs | 2 +- .../ColorConverters/JpegColorConverter.cs | 2 +- .../PixelFormats/Rgba32.PixelOperations.cs | 12 +- .../Color/Bulk/ToVector4.cs | 8 +- .../General/Vectorization/UInt32ToSingle.cs | 27 +- .../Vectorization/WidenBytesToUInt32.cs | 7 +- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 108 +++++--- 13 files changed, 537 insertions(+), 290 deletions(-) create mode 100644 src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs create mode 100644 src/ImageSharp/Common/Tuples/Octet.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs new file mode 100644 index 000000000..e4dc1a1d8 --- /dev/null +++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs @@ -0,0 +1,212 @@ +// Copyright (c) Six Labors and contributors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using SixLabors.ImageSharp.Tuples; + +// ReSharper disable MemberHidesStaticFromOuterClass +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + /// + /// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*) + /// + public static class BasicIntrinsics256 + { + public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture; + + /// + /// as much elements as possible, slicing them down (keeping the remainder). + /// + internal static void BulkConvertByteToNormalizedFloatReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + if (IsAvailable) + { + int remainder = source.Length % 8; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertByteToNormalizedFloat( + source.Slice(0, alignedCount), + dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } + } + + /// + /// Convert 'source.Length' values normalized into [0..1] from 'source' + /// into 'dest' buffer of . The values are scaled up into [0-255] and rounded. + /// The implementation is SIMD optimized and works only with `source.Length` divisible by 8/>. + /// Based on: + /// + /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions + /// + /// + internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan source, Span dest) + { + GuardAvx2(nameof(BulkConvertNormalizedFloatToByte)); + + DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!"); + + if (source.Length == 0) + { + return; + } + + ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + int n = source.Length / 8; + + Vector magick = new Vector(32768.0f); + Vector scale = new Vector(255f) / new Vector(256f); + + // need to copy to a temporary struct, because + // SimdUtils.Octet.OfUInt32 temp = Unsafe.As, SimdUtils.Octet.OfUInt32>(ref x) + // does not work. TODO: This might be a CoreClr bug, need to ask/report + var temp = default(Octet.OfUInt32); + ref Vector tempRef = ref Unsafe.As>(ref temp); + + for (int i = 0; i < n; i++) + { + // union { float f; uint32_t i; } u; + // u.f = 32768.0f + x * (255.0f / 256.0f); + // return (uint8_t)u.i; + Vector x = Unsafe.Add(ref srcBase, i); + x = (x * scale) + magick; + tempRef = x; + + ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i); + d.LoadFrom(ref temp); + } + } + + /// + /// SIMD optimized implementation for . + /// Works only with `dest.Length` divisible by 8. + /// Implementation adapted from: + /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions + /// http://stackoverflow.com/a/536278 + /// + internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) + { + GuardAvx2(nameof(BulkConvertByteToNormalizedFloat)); + + DebugGuard.IsTrue((dest.Length % 8) == 0, nameof(source), "dest.Length should be divisable by 8!"); + + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f + var mask = new Vector(255); + + ref Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + + ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); + + int n = dest.Length / 8; + + for (int i = 0; i < n; i++) + { + ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); + ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i); + d.LoadFrom(ref s); + } + + for (int i = 0; i < n; i++) + { + ref Vector df = ref Unsafe.Add(ref destBaseAsFloat, i); + + var vi = Vector.AsVectorUInt32(df); + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; + + df = vf; + } + } + + /// + /// as much elements as possible, slicing them down (keeping the remainder). + /// + internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + if (IsAvailable) + { + int remainder = source.Length % Vector.Count; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } + } + + /// + /// Same as but clamps overflown values before conversion. + /// + internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) + { + GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows)); + + DebugGuard.IsTrue((source.Length % 8) == 0, nameof(source), "source.Length should be divisible by 8!"); + + if (source.Length == 0) + { + return; + } + + ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + int n = source.Length / 8; + + Vector magick = new Vector(32768.0f); + Vector scale = new Vector(255f) / new Vector(256f); + + // need to copy to a temporary struct, because + // SimdUtils.Octet.OfUInt32 temp = Unsafe.As, SimdUtils.Octet.OfUInt32>(ref x) + // does not work. TODO: This might be a CoreClr bug, need to ask/report + var temp = default(Octet.OfUInt32); + ref Vector tempRef = ref Unsafe.As>(ref temp); + + for (int i = 0; i < n; i++) + { + // union { float f; uint32_t i; } u; + // u.f = 32768.0f + x * (255.0f / 256.0f); + // return (uint8_t)u.i; + Vector x = Unsafe.Add(ref srcBase, i); + x = Vector.Max(x, Vector.Zero); + x = Vector.Min(x, Vector.One); + + x = (x * scale) + magick; + tempRef = x; + + ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i); + d.LoadFrom(ref temp); + } + } + } + } +} \ No newline at end of file diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index ec91e5098..5c0b8ee93 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -1,8 +1,10 @@ using System; +using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +// ReSharper disable MemberHidesStaticFromOuterClass namespace SixLabors.ImageSharp { internal static partial class SimdUtils @@ -18,22 +20,47 @@ namespace SixLabors.ImageSharp { public static bool IsAvailable { get; } = #if NETCOREAPP2_1 -// TODO: Also available in .NET 4.7.2, we need to add a build target! - true; + // TODO: Also available in .NET 4.7.2, we need to add a build target! + Vector.IsHardwareAccelerated; #else false; #endif /// - /// A variant of , which is faster on new .NET runtime. + /// as much elements as possible, slicing them down (keeping the remainder). + /// + [Conditional("NETCOREAPP2_1")] + internal static void BulkConvertByteToNormalizedFloatReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + if (IsAvailable) + { + int remainder = source.Length % Vector.Count; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertByteToNormalizedFloat(source.Slice(0, alignedCount), dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } + } + + /// + /// A variant of , which is faster on new RyuJIT runtime. /// // ReSharper disable once MemberHidesStaticFromOuterClass internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - Guard.IsTrue( + DebugGuard.IsTrue( dest.Length % Vector.Count == 0, nameof(source), - "dest.Length should be divisable by Vector.Count!"); + "dest.Length should be divisible by Vector.Count!"); int n = dest.Length / Vector.Count; @@ -63,34 +90,52 @@ namespace SixLabors.ImageSharp } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector ConvertToSingle(Vector u, Vector scale) + /// + /// as much elements as possible, slicing them down (keeping the remainder). + /// + [Conditional("NETCOREAPP2_1")] + internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( + ref ReadOnlySpan source, + ref Span dest) { - Vector vi = Vector.AsVectorInt32(u); - Vector v = Vector.ConvertToSingle(vi); - v *= scale; - return v; + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + if (IsAvailable) + { + int remainder = source.Length % Vector.Count; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } } /// - /// A variant of , which is faster on new .NET runtime. + /// A variant of , which is faster on new .NET runtime. /// /// /// It does NOT worth yet to utilize this method (2018 Oct). /// See benchmark results for the "PackFromVector4_Rgba32" benchmark! /// TODO: Check again later! /// - // ReSharper disable once MemberHidesStaticFromOuterClass - internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) + internal static void BulkConvertNormalizedFloatToByteClampOverflows( + ReadOnlySpan source, + Span dest) { - Guard.IsTrue( + DebugGuard.IsTrue( dest.Length % Vector.Count == 0, nameof(dest), - "dest.Length should be divisable by Vector.Count!"); + "dest.Length should be divisible by Vector.Count!"); int n = dest.Length / Vector.Count; - ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Vector sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); for (int i = 0; i < n; i++) @@ -126,6 +171,15 @@ namespace SixLabors.ImageSharp Vector vi = Vector.ConvertToInt32(vf); return Vector.AsVectorUInt32(vi); } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector ConvertToSingle(Vector u, Vector scale) + { + Vector vi = Vector.AsVectorInt32(u); + Vector v = Vector.ConvertToSingle(vi); + v *= scale; + return v; + } } } -} +} \ No newline at end of file diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 91aed8c79..73e9bacfa 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -6,6 +6,9 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Tuples; + namespace SixLabors.ImageSharp { /// @@ -16,7 +19,8 @@ namespace SixLabors.ImageSharp /// /// Gets a value indicating whether the code is being executed on AVX2 CPU where both float and integer registers are of size 256 byte. /// - public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8; + public static bool IsAvx2CompatibleArchitecture { get; } = + Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8; internal static void GuardAvx2(string operation) { @@ -57,236 +61,61 @@ namespace SixLabors.ImageSharp } /// - /// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of values. - /// The values are scaled up into [0-255] and rounded. - /// The implementation is SIMD optimized and works only with `source.Length` divisible by . - /// Based on: - /// - /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions - /// - /// - internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan source, Span dest) - { - GuardAvx2(nameof(BulkConvertNormalizedFloatToByte)); - - DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!"); - - if (source.Length == 0) - { - return; - } - - ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); - ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); - int n = source.Length / 8; - - Vector magick = new Vector(32768.0f); - Vector scale = new Vector(255f) / new Vector(256f); - - // need to copy to a temporary struct, because - // SimdUtils.Octet.OfUInt32 temp = Unsafe.As, SimdUtils.Octet.OfUInt32>(ref x) - // does not work. TODO: This might be a CoreClr bug, need to ask/report - var temp = default(Octet.OfUInt32); - ref Vector tempRef = ref Unsafe.As>(ref temp); - - for (int i = 0; i < n; i++) - { - // union { float f; uint32_t i; } u; - // u.f = 32768.0f + x * (255.0f / 256.0f); - // return (uint8_t)u.i; - Vector x = Unsafe.Add(ref srcBase, i); - x = (x * scale) + magick; - tempRef = x; - - ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i); - d.LoadFrom(ref temp); - } - } - - /// - /// Converts `dest.Length` bytes to -s to -s normalized into [0..1] - /// The implementation is SIMD optimized and works only with `dest.Length` divisible by . - /// Implementation adapted from: - /// - /// http://stackoverflow.com/a/5362789 - /// + /// Converts `dest.Length` -s to -s normalized into [0..1]. + /// should be the of the same size as , + /// but there are no restrictions on the span's length. /// internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - GuardAvx2(nameof(BulkConvertByteToNormalizedFloat)); - - DebugGuard.IsTrue((dest.Length % Vector.Count) == 0, nameof(source), "dest.Length should be divisable by Vector.Count!"); - - var bVec = new Vector(256.0f / 255.0f); - var magicFloat = new Vector(32768.0f); - var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f - var mask = new Vector(255); + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); - ref Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); - ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); - - ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); - - int n = dest.Length / 8; - - for (int i = 0; i < n; i++) - { - ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); - ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i); - d.LoadFrom(ref s); - } + ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); + BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); - for (int i = 0; i < n; i++) + // Deal with the remainder: + int count = source.Length; + if (count > 0) { - ref Vector df = ref Unsafe.Add(ref destBaseAsFloat, i); - - var vi = Vector.AsVectorUInt32(df); - vi &= mask; - vi |= magicInt; - - var vf = Vector.AsVectorSingle(vi); - vf = (vf - magicFloat) * bVec; - - df = vf; + // TODO: Do we need to optimize anything on this? (There are at most 7 remainders) + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref float dBase = ref MemoryMarshal.GetReference(dest); + for (int i = 0; i < count; i++) + { + Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f; + } } } /// - /// Same as but clamps overflown values before conversion. + /// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of . + /// The values are scaled up into [0-255] and rounded, overflows are clamped. + /// should be the of the same size as , + /// but there are no restrictions on the span's length. /// internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) { - GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows)); - - DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!"); - - if (source.Length == 0) - { - return; - } - - ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); - ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); - int n = source.Length / 8; - - Vector magick = new Vector(32768.0f); - Vector scale = new Vector(255f) / new Vector(256f); - - // need to copy to a temporary struct, because - // SimdUtils.Octet.OfUInt32 temp = Unsafe.As, SimdUtils.Octet.OfUInt32>(ref x) - // does not work. TODO: This might be a CoreClr bug, need to ask/report - var temp = default(Octet.OfUInt32); - ref Vector tempRef = ref Unsafe.As>(ref temp); - - for (int i = 0; i < n; i++) - { - // union { float f; uint32_t i; } u; - // u.f = 32768.0f + x * (255.0f / 256.0f); - // return (uint8_t)u.i; - Vector x = Unsafe.Add(ref srcBase, i); - x = Vector.Max(x, Vector.Zero); - x = Vector.Min(x, Vector.One); - - x = (x * scale) + magick; - tempRef = x; - - ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i); - d.LoadFrom(ref temp); - } - } - - // TODO: Replace these with T4-d library level tuples! - internal static class Octet - { - [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))] - public struct OfUInt32 - { - [FieldOffset(0 * sizeof(uint))] - public uint V0; - - [FieldOffset(1 * sizeof(uint))] - public uint V1; - - [FieldOffset(2 * sizeof(uint))] - public uint V2; - - [FieldOffset(3 * sizeof(uint))] - public uint V3; - - [FieldOffset(4 * sizeof(uint))] - public uint V4; - - [FieldOffset(5 * sizeof(uint))] - public uint V5; + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); - [FieldOffset(6 * sizeof(uint))] - public uint V6; + ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); + BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); - [FieldOffset(7 * sizeof(uint))] - public uint V7; - - public override string ToString() - { - return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; - } - - [MethodImpl(InliningOptions.ShortMethod)] - public void LoadFrom(ref OfByte src) - { - this.V0 = src.V0; - this.V1 = src.V1; - this.V2 = src.V2; - this.V3 = src.V3; - this.V4 = src.V4; - this.V5 = src.V5; - this.V6 = src.V6; - this.V7 = src.V7; - } - } - - [StructLayout(LayoutKind.Explicit, Size = 8)] - public struct OfByte + // Deal with the remainder: + int count = source.Length; + if (count > 0) { - [FieldOffset(0)] - public byte V0; - - [FieldOffset(1)] - public byte V1; - - [FieldOffset(2)] - public byte V2; - - [FieldOffset(3)] - public byte V3; - - [FieldOffset(4)] - public byte V4; - - [FieldOffset(5)] - public byte V5; - - [FieldOffset(6)] - public byte V6; - - [FieldOffset(7)] - public byte V7; - - public override string ToString() - { - return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; - } + ref float sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); - [MethodImpl(InliningOptions.ShortMethod)] - public void LoadFrom(ref OfUInt32 src) + for (int i = 0; i < count; i++) { - this.V0 = (byte)src.V0; - this.V1 = (byte)src.V1; - this.V2 = (byte)src.V2; - this.V3 = (byte)src.V3; - this.V4 = (byte)src.V4; - this.V5 = (byte)src.V5; - this.V6 = (byte)src.V6; - this.V7 = (byte)src.V7; + // TODO: Do we need to optimize anything on this? (There are at most 7 remainders) + float f = Unsafe.Add(ref sBase, i); + f *= 255f; + f += 0.5f; + f = MathF.Max(0, f); + f = MathF.Min(255f, f); + + Unsafe.Add(ref dBase, i) = (byte)f; } } } diff --git a/src/ImageSharp/Common/Tuples/Octet.cs b/src/ImageSharp/Common/Tuples/Octet.cs new file mode 100644 index 000000000..ae01a3121 --- /dev/null +++ b/src/ImageSharp/Common/Tuples/Octet.cs @@ -0,0 +1,100 @@ +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp.Tuples +{ + internal static class Octet + { + [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))] + public struct OfUInt32 + { + [FieldOffset(0 * sizeof(uint))] + public uint V0; + + [FieldOffset(1 * sizeof(uint))] + public uint V1; + + [FieldOffset(2 * sizeof(uint))] + public uint V2; + + [FieldOffset(3 * sizeof(uint))] + public uint V3; + + [FieldOffset(4 * sizeof(uint))] + public uint V4; + + [FieldOffset(5 * sizeof(uint))] + public uint V5; + + [FieldOffset(6 * sizeof(uint))] + public uint V6; + + [FieldOffset(7 * sizeof(uint))] + public uint V7; + + public override string ToString() + { + return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; + } + + [MethodImpl(InliningOptions.ShortMethod)] + public void LoadFrom(ref OfByte src) + { + this.V0 = src.V0; + this.V1 = src.V1; + this.V2 = src.V2; + this.V3 = src.V3; + this.V4 = src.V4; + this.V5 = src.V5; + this.V6 = src.V6; + this.V7 = src.V7; + } + } + + [StructLayout(LayoutKind.Explicit, Size = 8)] + public struct OfByte + { + [FieldOffset(0)] + public byte V0; + + [FieldOffset(1)] + public byte V1; + + [FieldOffset(2)] + public byte V2; + + [FieldOffset(3)] + public byte V3; + + [FieldOffset(4)] + public byte V4; + + [FieldOffset(5)] + public byte V5; + + [FieldOffset(6)] + public byte V6; + + [FieldOffset(7)] + public byte V7; + + public override string ToString() + { + return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; + } + + [MethodImpl(InliningOptions.ShortMethod)] + public void LoadFrom(ref OfUInt32 src) + { + this.V0 = (byte)src.V0; + this.V1 = (byte)src.V1; + this.V2 = (byte)src.V2; + this.V3 = (byte)src.V3; + this.V4 = (byte)src.V4; + this.V5 = (byte)src.V5; + this.V6 = (byte)src.V6; + this.V7 = (byte)src.V7; + } + } + } +} \ No newline at end of file diff --git a/src/ImageSharp/Common/Tuples/Vector4Pair.cs b/src/ImageSharp/Common/Tuples/Vector4Pair.cs index 309d5e2e5..5988b2200 100644 --- a/src/ImageSharp/Common/Tuples/Vector4Pair.cs +++ b/src/ImageSharp/Common/Tuples/Vector4Pair.cs @@ -2,7 +2,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -namespace SixLabors.ImageSharp.Common.Tuples +namespace SixLabors.ImageSharp.Tuples { /// /// Its faster to process multiple Vector4-s together, so let's pair them! diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs index 4b2626c58..5c63a478d 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs @@ -6,7 +6,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using SixLabors.ImageSharp.Common.Tuples; +using SixLabors.ImageSharp.Tuples; namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters { diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index ab4947e65..3f26cdc90 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -6,7 +6,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using SixLabors.ImageSharp.Common.Tuples; +using SixLabors.ImageSharp.Tuples; // ReSharper disable ImpureMethodCallOnReadonlyValueField namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index 60abb7fb2..293f3bc1f 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -6,8 +6,8 @@ using System.Collections.Generic; using System.Linq; using System.Numerics; -using SixLabors.ImageSharp.Common.Tuples; using SixLabors.ImageSharp.Memory; +using SixLabors.ImageSharp.Tuples; using SixLabors.Memory; namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs index bfef60c60..564b93ef5 100644 --- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs @@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.PixelFormats } else { - ConvertToVector4UsingStandardIntrinsics(sourceColors, destinationVectors, count); + ConvertToVector4UsingBasicIntrinsics(sourceColors, destinationVectors, count); } } @@ -58,7 +58,7 @@ namespace SixLabors.ImageSharp.PixelFormats } else { - ConvertFromVector4StandardIntrinsics(sourceVectors, destinationColors, count); + ConvertFromVector4BasicIntrinsics(sourceVectors, destinationColors, count); } } @@ -112,7 +112,7 @@ namespace SixLabors.ImageSharp.PixelFormats } } - private static void ConvertToVector4UsingStandardIntrinsics( + private static void ConvertToVector4UsingBasicIntrinsics( ReadOnlySpan sourceColors, Span destinationVectors, int count) @@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.PixelFormats ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); - SimdUtils.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); + SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); } if (remainder > 0) @@ -155,7 +155,7 @@ namespace SixLabors.ImageSharp.PixelFormats } } - private static void ConvertFromVector4StandardIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count) + private static void ConvertFromVector4BasicIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count) { int remainder = count % 2; int alignedCount = count - remainder; @@ -165,7 +165,7 @@ namespace SixLabors.ImageSharp.PixelFormats ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount)); Span rawDest = MemoryMarshal.Cast(destinationColors); - SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); + SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); } if (remainder > 0) diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 726e214a9..855e9e4b9 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -30,8 +30,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Params( //64, //256, - //512, - 2048 + 512 + //1024 )] public int Count { get; set; } @@ -117,7 +117,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } - //[Benchmark] + [Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -159,7 +159,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - //[Benchmark] + [Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs index be19e719a..ca85a350c 100644 --- a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs +++ b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs @@ -5,6 +5,7 @@ using BenchmarkDotNet.Attributes; namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization { + [Config(typeof(Config.ShortClr))] public class UInt32ToSingle { private float[] data; @@ -66,8 +67,7 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization Unsafe.Add(ref bf, i) = v; } } - - // This code is not correct at all, it's just here as reference + [Benchmark] public void StandardSimdFromInt() { @@ -86,5 +86,28 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization Unsafe.Add(ref bf, i) = v; } } + + + [Benchmark] + public void StandardSimdFromInt_RefCast() + { + int n = Count / Vector.Count; + + ref Vector bf = ref Unsafe.As>(ref this.data[0]); + ref Vector bu = ref Unsafe.As, Vector>(ref bf); + + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + ref Vector fRef = ref Unsafe.Add(ref bf, i); + + Vector du = Vector.AsVectorInt32(fRef); + Vector v = Vector.ConvertToSingle(du); + v *= scale; + + fRef = v; + } + } } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs index f71f6ec1b..2bc3af4c9 100644 --- a/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs +++ b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs @@ -3,8 +3,11 @@ using System.Runtime.CompilerServices; using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Tuples; + namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization { + [Config(typeof(Config.ShortClr))] public class WidenBytesToUInt32 { private byte[] source; @@ -25,8 +28,8 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization { const int N = Count / 8; - ref SimdUtils.Octet.OfByte sBase = ref Unsafe.As(ref this.source[0]); - ref SimdUtils.Octet.OfUInt32 dBase = ref Unsafe.As(ref this.dest[0]); + ref Octet.OfByte sBase = ref Unsafe.As(ref this.source[0]); + ref Octet.OfUInt32 dBase = ref Unsafe.As(ref this.dest[0]); for (int i = 0; i < N; i++) { diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 4e1717bda..2dcba2b74 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -62,7 +62,7 @@ namespace SixLabors.ImageSharp.Tests.Common { float[] data = new float[Vector.Count]; - var rnd = new Random(); + var rnd = new Random(seed); for (int i = 0; i < Vector.Count; i++) { @@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common [InlineData(1, 8)] [InlineData(2, 16)] [InlineData(3, 128)] - public void BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count) + public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count) { if (this.SkipOnNonAvx2()) { @@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Tests.Common byte[] dest = new byte[count]; - SimdUtils.BulkConvertNormalizedFloatToByte(normalized, dest); + SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(normalized, dest); byte[] expected = orig.Select(f => (byte)(f)).ToArray(); @@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common [InlineData(1, 8)] [InlineData(2, 16)] [InlineData(3, 128)] - public void BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count) + public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count) { if (this.SkipOnNonAvx2()) { @@ -153,87 +153,113 @@ namespace SixLabors.ImageSharp.Tests.Common byte[] dest = new byte[count]; - SimdUtils.BulkConvertNormalizedFloatToByte(source, dest); + SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(source, dest); byte[] expected = source.Select(f => (byte)Math.Round(f * 255f)).ToArray(); Assert.Equal(expected, dest); } + public static readonly TheoryData ArraySizesDivisibleBy8 = new TheoryData { 0, 8, 16, 1024 }; + + public static readonly TheoryData ArraySizesDivisibleBy32 = new TheoryData { 0, 32, 512 }; + + public static readonly TheoryData ArbitraryArraySizes = + new TheoryData + { + 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 520, + }; [Theory] - [InlineData(1, 0)] - [InlineData(2, 32)] - [InlineData(3, 128)] - public void BulkConvertByteToNormalizedFloat(int seed, int count) + [MemberData(nameof(ArraySizesDivisibleBy8))] + public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count) { if (this.SkipOnNonAvx2()) { return; } - byte[] source = new Random(seed).GenerateRandomByteArray(count); - float[] result = new float[count]; - float[] expected = source.Select(b => (float)b / 255f).ToArray(); - - SimdUtils.BulkConvertByteToNormalizedFloat(source, result); - - Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); + TestImpl_BulkConvertByteToNormalizedFloat( + count, + (s, d) => SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(s.Span, d.Span)); } [Theory] - [InlineData(1, 0)] - [InlineData(2, 32)] - [InlineData(3, 128)] - public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count) + [MemberData(nameof(ArraySizesDivisibleBy32))] + public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int count) + { + TestImpl_BulkConvertByteToNormalizedFloat( + count, + (s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(s.Span, d.Span)); + } + + [Theory] + [MemberData(nameof(ArbitraryArraySizes))] + public void BulkConvertByteToNormalizedFloat(int count) + { + TestImpl_BulkConvertByteToNormalizedFloat( + count, + (s, d) => SimdUtils.BulkConvertByteToNormalizedFloat(s.Span, d.Span)); + } + + private static void TestImpl_BulkConvertByteToNormalizedFloat( + int count, + Action, Memory> convert) { - byte[] source = new Random(seed).GenerateRandomByteArray(count); + byte[] source = new Random(count).GenerateRandomByteArray(count); float[] result = new float[count]; float[] expected = source.Select(b => (float)b / 255f).ToArray(); - - SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result); + convert(source, result); Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } - - public static readonly TheoryData BulkConvertNormalizedFloatToByteClampOverflows_Data = - new TheoryData - { - 0, 64, 1024 - }; - [Theory] - [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))] - public void BulkConvertNormalizedFloatToByteClampOverflows(int count) + [MemberData(nameof(ArraySizesDivisibleBy8))] + public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) { if (this.SkipOnNonAvx2()) { return; } - float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f); - byte[] expected = source.Select(NormalizedFloatToByte).ToArray(); - byte[] actual = new byte[count]; - - SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(source, actual); - - Assert.Equal(expected, actual); + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, + (s, d) => SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span) + ); } [Theory] - [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))] + [MemberData(nameof(ArraySizesDivisibleBy32))] public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) + { + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, + (s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span) + ); + } + + [Theory] + [MemberData(nameof(ArbitraryArraySizes))] + public void BulkConvertNormalizedFloatToByteClampOverflows(int count) + { + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, + (s, d) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span) + ); + } + + private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( + int count, + Action, Memory> convert) { float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f); byte[] expected = source.Select(NormalizedFloatToByte).ToArray(); byte[] actual = new byte[count]; - SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(source, actual); + convert(source, actual); Assert.Equal(expected, actual); } + private static byte NormalizedFloatToByte(float f) => (byte)Math.Min(255f, Math.Max(0f, f * 255f + 0.5f)); [Theory] From 34ab918624f802989629402d0825a28aca82a634 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sat, 20 Oct 2018 23:31:21 +0200 Subject: [PATCH 14/22] fix benchmarks --- .../Color/Bulk/PackFromVector4.cs | 4 +- .../Color/Bulk/ToVector4.cs | 73 ++----------------- 2 files changed, 8 insertions(+), 69 deletions(-) diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index 1153d8f40..eb7154955 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -95,12 +95,12 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } [Benchmark(Baseline = true)] - public void BulkConvertNormalizedFloatToByteClampOverflows() + public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); + SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } [Benchmark] diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 855e9e4b9..c50c7ce5a 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -30,8 +30,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Params( //64, //256, - 512 - //1024 + //512, + //1024, + 2048 )] public int Count { get; set; } @@ -100,12 +101,12 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } [Benchmark(Baseline = true)] - public void BulkConvertByteToNormalizedFloat() + public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - SimdUtils.BulkConvertByteToNormalizedFloat(sBytes, dFloats); + SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } [Benchmark] @@ -117,7 +118,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } - [Benchmark] + //[Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -200,67 +201,5 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk v *= scale; return v; } - - //[Benchmark] - public void OldImplementation() - { - int count = this.Count; - var bVec = new Vector(256.0f / 255.0f); - var magicFloat = new Vector(32768.0f); - var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f - var mask = new Vector(255); - - int unpackedRawCount = count * 4; - - ref uint sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference((ReadOnlySpan)this.source.GetSpan())); - ref UnpackedRGBA destBaseAsUnpacked = ref Unsafe.As(ref MemoryMarshal.GetReference(this.destination.GetSpan())); - ref Vector destBaseAsUInt = ref Unsafe.As>(ref destBaseAsUnpacked); - ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsUnpacked); - - for (int i = 0; i < count; i++) - { - uint sVal = Unsafe.Add(ref sourceBase, i); - ref UnpackedRGBA dst = ref Unsafe.Add(ref destBaseAsUnpacked, i); - - // This call is the bottleneck now: - dst.Load(sVal); - } - - int numOfVectors = unpackedRawCount / Vector.Count; - - for (int i = 0; i < numOfVectors; i++) - { - Vector vi = Unsafe.Add(ref destBaseAsUInt, i); - - vi &= mask; - vi |= magicInt; - - var vf = Vector.AsVectorSingle(vi); - vf = (vf - magicFloat) * bVec; - - Unsafe.Add(ref destBaseAsFloat, i) = vf; - } - } - - [StructLayout(LayoutKind.Sequential)] - private struct UnpackedRGBA - { - private uint r; - - private uint g; - - private uint b; - - private uint a; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void Load(uint p) - { - this.r = p; - this.g = p >> 8; - this.b = p >> 16; - this.a = p >> 24; - } - } } } \ No newline at end of file From 2fcda3cee0d4091678bf4a41bfcfa2b88a444949 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sun, 21 Oct 2018 01:21:36 +0200 Subject: [PATCH 15/22] simplify Rgba32.PixelOperations, include benchmark results --- .../PixelFormats/PixelOperations{TPixel}.cs | 60 ++++----- .../PixelFormats/Rgba32.PixelOperations.cs | 123 ++---------------- .../Color/Bulk/PackFromVector4.cs | 41 ++++-- .../Color/Bulk/ToVector4.cs | 32 ++++- 4 files changed, 94 insertions(+), 162 deletions(-) diff --git a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs index 39c442fe0..cbf164a71 100644 --- a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs +++ b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs @@ -29,7 +29,19 @@ namespace SixLabors.ImageSharp.PixelFormats /// The number of pixels to convert. internal virtual void PackFromVector4(ReadOnlySpan sourceVectors, Span destinationColors, int count) { - PackFromVector4Common(sourceVectors, destinationColors, count); + ReadOnlySpan sourceVectors1 = sourceVectors; + Span destinationColors1 = destinationColors; + GuardSpans(sourceVectors1, nameof(sourceVectors1), destinationColors1, nameof(destinationColors1), count); + + ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors1); + ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors1); + + for (int i = 0; i < count; i++) + { + ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i); + ref TPixel dp = ref Unsafe.Add(ref destRef, i); + dp.PackFromVector4(sp); + } } /// @@ -40,7 +52,19 @@ namespace SixLabors.ImageSharp.PixelFormats /// The number of pixels to convert. internal virtual void ToVector4(ReadOnlySpan sourceColors, Span destinationVectors, int count) { - ToVector4Common(sourceColors, destinationVectors, count); + ReadOnlySpan sourceColors1 = sourceColors; + Span destinationVectors1 = destinationVectors; + GuardSpans(sourceColors1, nameof(sourceColors1), destinationVectors1, nameof(destinationVectors1), count); + + ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors1); + ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors1); + + for (int i = 0; i < count; i++) + { + ref TPixel sp = ref Unsafe.Add(ref sourceRef, i); + ref Vector4 dp = ref Unsafe.Add(ref destRef, i); + dp = sp.ToVector4(); + } } /// @@ -106,37 +130,5 @@ namespace SixLabors.ImageSharp.PixelFormats Guard.MustBeSizedAtLeast(source, minLength, sourceParamName); Guard.MustBeSizedAtLeast(destination, minLength, destinationParamName); } - - [MethodImpl(InliningOptions.ShortMethod)] - internal static void PackFromVector4Common(ReadOnlySpan sourceVectors, Span destinationColors, int count) - { - GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); - - ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors); - ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors); - - for (int i = 0; i < count; i++) - { - ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i); - ref TPixel dp = ref Unsafe.Add(ref destRef, i); - dp.PackFromVector4(sp); - } - } - - [MethodImpl(InliningOptions.ShortMethod)] - internal static void ToVector4Common(ReadOnlySpan sourceColors, Span destinationVectors, int count) - { - GuardSpans(sourceColors, nameof(sourceColors), destinationVectors, nameof(destinationVectors), count); - - ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors); - ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors); - - for (int i = 0; i < count; i++) - { - ref TPixel sp = ref Unsafe.Add(ref sourceRef, i); - ref Vector4 dp = ref Unsafe.Add(ref destRef, i); - dp = sp.ToVector4(); - } - } } } \ No newline at end of file diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs index 564b93ef5..bb42ec7e3 100644 --- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs @@ -24,21 +24,12 @@ namespace SixLabors.ImageSharp.PixelFormats Guard.MustBeSizedAtLeast(sourceColors, count, nameof(sourceColors)); Guard.MustBeSizedAtLeast(destinationVectors, count, nameof(destinationVectors)); - if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) - { - // Doesn't worth to bother with SIMD: - ToVector4Common(sourceColors, destinationVectors, count); - return; - } + sourceColors = sourceColors.Slice(0, count); + destinationVectors = destinationVectors.Slice(0, count); - if (SimdUtils.ExtendedIntrinsics.IsAvailable) - { - ConvertToVector4UsingExtendedIntrinsics(sourceColors, destinationVectors, count); - } - else - { - ConvertToVector4UsingBasicIntrinsics(sourceColors, destinationVectors, count); - } + SimdUtils.BulkConvertByteToNormalizedFloat( + MemoryMarshal.Cast(sourceColors), + MemoryMarshal.Cast(destinationVectors)); } /// @@ -46,20 +37,12 @@ namespace SixLabors.ImageSharp.PixelFormats { GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); - if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) - { - PackFromVector4Common(sourceVectors, destinationColors, count); - return; - } + sourceVectors = sourceVectors.Slice(0, count); + destinationColors = destinationColors.Slice(0, count); - if (SimdUtils.ExtendedIntrinsics.IsAvailable) - { - ConvertFromVector4ExtendedIntrinsics(sourceVectors, destinationColors, count); - } - else - { - ConvertFromVector4BasicIntrinsics(sourceVectors, destinationColors, count); - } + SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows( + MemoryMarshal.Cast(sourceVectors), + MemoryMarshal.Cast(destinationColors)); } /// @@ -89,92 +72,6 @@ namespace SixLabors.ImageSharp.PixelFormats sourcePixels.Slice(0, count).CopyTo(dest); } - - private static void ConvertToVector4UsingExtendedIntrinsics( - ReadOnlySpan sourceColors, - Span destinationVectors, - int count) - { - int remainder = count % 8; - int alignedCount = count - remainder; - - if (alignedCount > 0) - { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); - Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); - - SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); - } - - if (remainder > 0) - { - ToVector4Common(sourceColors.Slice(alignedCount), destinationVectors.Slice(alignedCount), remainder); - } - } - - private static void ConvertToVector4UsingBasicIntrinsics( - ReadOnlySpan sourceColors, - Span destinationVectors, - int count) - { - int remainder = count % 2; - int alignedCount = count - remainder; - - if (alignedCount > 0) - { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); - Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); - - SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); - } - - if (remainder > 0) - { - // actually: remainder == 1 - int lastIdx = count - 1; - destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4(); - } - } - - private static void ConvertFromVector4ExtendedIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count) - { - int remainder = count % 8; - int alignedCount = count - remainder; - - if (alignedCount > 0) - { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors); - Span rawDest = MemoryMarshal.Cast(destinationColors.Slice(0, alignedCount)); - - SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); - } - - if (remainder > 0) - { - PackFromVector4Common(sourceVectors.Slice(alignedCount), destinationColors.Slice(alignedCount), remainder); - } - } - - private static void ConvertFromVector4BasicIntrinsics(ReadOnlySpan sourceVectors, Span destinationColors, int count) - { - int remainder = count % 2; - int alignedCount = count - remainder; - - if (alignedCount > 0) - { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount)); - Span rawDest = MemoryMarshal.Cast(destinationColors); - - SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); - } - - if (remainder > 0) - { - // actually: remainder == 1 - int lastIdx = count - 1; - destinationColors[lastIdx].PackFromVector4(sourceVectors[lastIdx]); - } - } } } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index eb7154955..7a212b052 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -25,7 +25,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk protected IMemoryOwner destination; [Params( - //64, + 64, 2048 )] public int Count { get; set; } @@ -72,7 +72,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public class PackFromVector4_Rgba32 : PackFromVector4 { [Benchmark] - public void FastDefault() + public void BasicBulk() { ref Vector4 sBase = ref this.source.GetSpan()[0]; ref Rgba32 dBase = ref this.destination.GetSpan()[0]; @@ -112,16 +112,31 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } - // RESULTS: - // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Allocated | - // ----------------------------------------------------------------- |-------- |------ |----------:|----------:|----------:|-------:|---------:|----------:| - // FastDefault | Clr | 2048 | 15.989 us | 6.1384 us | 0.3468 us | 4.07 | 0.08 | 0 B | - // BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3.931 us | 0.6264 us | 0.0354 us | 1.00 | 0.00 | 0 B | - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 2.100 us | 0.4717 us | 0.0267 us | 0.53 | 0.01 | 0 B | - // - // | | | | | | | | | - // FastDefault | Core | 2048 | 14.693 us | 0.5131 us | 0.0290 us | 3.76 | 0.03 | 0 B | - // BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 3.913 us | 0.5661 us | 0.0320 us | 1.00 | 0.00 | 0 B | - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 1.966 us | 0.4056 us | 0.0229 us | 0.50 | 0.01 | 0 B | + // RESULTS (2018 October): + // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated | + // ------------------------------------------------------------------ |-------- |------ |-------------:|-------------:|-----------:|-------:|---------:|-------:|----------:| + // BasicBulk | Clr | 64 | 581.62 ns | 33.625 ns | 1.8999 ns | 2.27 | 0.02 | - | 0 B | + // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 64 | 256.66 ns | 45.153 ns | 2.5512 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 64 | 201.92 ns | 30.161 ns | 1.7042 ns | 0.79 | 0.01 | - | 0 B | + // PixelOperations_Base | Clr | 64 | 665.01 ns | 13.032 ns | 0.7363 ns | 2.59 | 0.02 | 0.0067 | 24 B | + // PixelOperations_Specialized | Clr | 64 | 295.14 ns | 26.335 ns | 1.4880 ns | 1.15 | 0.01 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Core | 64 | 513.22 ns | 91.110 ns | 5.1479 ns | 3.19 | 0.03 | - | 0 B | + // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Core | 64 | 160.76 ns | 2.760 ns | 0.1559 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 64 | 95.98 ns | 10.077 ns | 0.5694 ns | 0.60 | 0.00 | - | 0 B | + // PixelOperations_Base | Core | 64 | 591.74 ns | 49.856 ns | 2.8170 ns | 3.68 | 0.01 | 0.0067 | 24 B | + // PixelOperations_Specialized | Core | 64 | 149.11 ns | 4.485 ns | 0.2534 ns | 0.93 | 0.00 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Clr | 2048 | 15,345.85 ns | 1,213.551 ns | 68.5679 ns | 3.90 | 0.01 | - | 0 B | + // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3,939.49 ns | 71.101 ns | 4.0173 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 2,272.61 ns | 110.671 ns | 6.2531 ns | 0.58 | 0.00 | - | 0 B | + // PixelOperations_Base | Clr | 2048 | 17,422.47 ns | 811.733 ns | 45.8644 ns | 4.42 | 0.01 | - | 24 B | + // PixelOperations_Specialized | Clr | 2048 | 3,984.26 ns | 110.352 ns | 6.2351 ns | 1.01 | 0.00 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Core | 2048 | 14,950.43 ns | 699.309 ns | 39.5123 ns | 3.76 | 0.02 | - | 0 B | + // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 3,978.28 ns | 481.105 ns | 27.1833 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 2,169.54 ns | 75.606 ns | 4.2719 ns | !!0.55!| 0.00 | - | 0 B | + // PixelOperations_Base | Core | 2048 | 18,403.62 ns | 1,494.056 ns | 84.4169 ns | 4.63 | 0.03 | - | 24 B | + // PixelOperations_Specialized | Core | 2048 | 2,227.60 ns | 486.761 ns | 27.5029 ns | !!0.56!| 0.01 | - | 0 B | } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index c50c7ce5a..4a801d64e 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -28,7 +28,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk protected IMemoryOwner destination; [Params( - //64, + 64, //256, //512, //1024, @@ -160,7 +160,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - [Benchmark] + //[Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -201,5 +201,33 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk v *= scale; return v; } + + // RESULTS (2018 October): + // + // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated | + // ---------------------------------------------------- |-------- |------ |------------:|-------------:|-----------:|-------:|---------:|-------:|----------:| + // BasicBulk | Clr | 64 | 267.40 ns | 30.711 ns | 1.7352 ns | 1.07 | 0.01 | - | 0 B | + // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Clr | 64 | 249.97 ns | 33.838 ns | 1.9119 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Clr | 64 | 176.97 ns | 5.221 ns | 0.2950 ns | 0.71 | 0.00 | - | 0 B | + // PixelOperations_Base | Clr | 64 | 349.70 ns | 104.331 ns | 5.8949 ns | 1.40 | 0.02 | 0.0072 | 24 B | + // PixelOperations_Specialized | Clr | 64 | 288.31 ns | 26.833 ns | 1.5161 ns | 1.15 | 0.01 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Core | 64 | 185.36 ns | 30.051 ns | 1.6979 ns | 1.26 | 0.01 | - | 0 B | + // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Core | 64 | 146.84 ns | 12.674 ns | 0.7161 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Core | 64 | 67.31 ns | 2.542 ns | 0.1436 ns | 0.46 | 0.00 | - | 0 B | + // PixelOperations_Base | Core | 64 | 272.03 ns | 94.419 ns | 5.3348 ns | 1.85 | 0.03 | 0.0072 | 24 B | + // PixelOperations_Specialized | Core | 64 | 121.91 ns | 31.477 ns | 1.7785 ns | 0.83 | 0.01 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Clr | 2048 | 5,133.04 ns | 284.052 ns | 16.0494 ns | 1.21 | 0.01 | - | 0 B | + // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Clr | 2048 | 4,248.58 ns | 1,095.887 ns | 61.9196 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Clr | 2048 | 1,214.02 ns | 184.349 ns | 10.4160 ns | 0.29 | 0.00 | - | 0 B | + // PixelOperations_Base | Clr | 2048 | 7,096.04 ns | 362.350 ns | 20.4734 ns | 1.67 | 0.02 | - | 24 B | + // PixelOperations_Specialized | Clr | 2048 | 4,314.19 ns | 204.964 ns | 11.5809 ns | 1.02 | 0.01 | - | 0 B | + // | | | | | | | | | | + // BasicBulk | Core | 2048 | 5,038.38 ns | 223.282 ns | 12.6158 ns | 1.20 | 0.01 | - | 0 B | + // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Core | 2048 | 4,199.17 ns | 897.985 ns | 50.7378 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Core | 2048 | 1,113.86 ns | 64.799 ns | 3.6613 ns | !!0.27!| 0.00 | - | 0 B | + // PixelOperations_Base | Core | 2048 | 7,015.00 ns | 920.083 ns | 51.9864 ns | 1.67 | 0.02 | - | 24 B | + // PixelOperations_Specialized | Core | 2048 | 1,176.59 ns | 256.955 ns | 14.5184 ns | !!0.28!| 0.00 | - | 0 B | } } \ No newline at end of file From cb8b48dcbaf0e4fc3f5d7402b7b488f1c9a0ce3d Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sun, 21 Oct 2018 01:31:55 +0200 Subject: [PATCH 16/22] cleanup code and comments --- .../Helpers/SimdUtils.BasicIntrinsics256.cs | 96 +++++++++---------- .../Helpers/SimdUtils.ExtendedIntrinsics.cs | 10 +- src/ImageSharp/Common/Helpers/SimdUtils.cs | 28 +++--- 3 files changed, 67 insertions(+), 67 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs index e4dc1a1d8..a8b343498 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs @@ -46,53 +46,6 @@ namespace SixLabors.ImageSharp } } - /// - /// Convert 'source.Length' values normalized into [0..1] from 'source' - /// into 'dest' buffer of . The values are scaled up into [0-255] and rounded. - /// The implementation is SIMD optimized and works only with `source.Length` divisible by 8/>. - /// Based on: - /// - /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions - /// - /// - internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan source, Span dest) - { - GuardAvx2(nameof(BulkConvertNormalizedFloatToByte)); - - DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!"); - - if (source.Length == 0) - { - return; - } - - ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); - ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); - int n = source.Length / 8; - - Vector magick = new Vector(32768.0f); - Vector scale = new Vector(255f) / new Vector(256f); - - // need to copy to a temporary struct, because - // SimdUtils.Octet.OfUInt32 temp = Unsafe.As, SimdUtils.Octet.OfUInt32>(ref x) - // does not work. TODO: This might be a CoreClr bug, need to ask/report - var temp = default(Octet.OfUInt32); - ref Vector tempRef = ref Unsafe.As>(ref temp); - - for (int i = 0; i < n; i++) - { - // union { float f; uint32_t i; } u; - // u.f = 32768.0f + x * (255.0f / 256.0f); - // return (uint8_t)u.i; - Vector x = Unsafe.Add(ref srcBase, i); - x = (x * scale) + magick; - tempRef = x; - - ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i); - d.LoadFrom(ref temp); - } - } - /// /// SIMD optimized implementation for . /// Works only with `dest.Length` divisible by 8. @@ -165,7 +118,7 @@ namespace SixLabors.ImageSharp } /// - /// Same as but clamps overflown values before conversion. + /// Implementation of which is faster on older runtimes. /// internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) { @@ -207,6 +160,53 @@ namespace SixLabors.ImageSharp d.LoadFrom(ref temp); } } + + /// + /// Convert 'source.Length' values normalized into [0..1] from 'source' + /// into 'dest' buffer of . The values are scaled up into [0-255] and rounded. + /// The implementation is SIMD optimized and works only with `source.Length` divisible by 8. + /// Based on: + /// + /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions + /// + /// + internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan source, Span dest) + { + GuardAvx2(nameof(BulkConvertNormalizedFloatToByte)); + + DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!"); + + if (source.Length == 0) + { + return; + } + + ref Vector srcBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Octet.OfByte destBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + int n = source.Length / 8; + + Vector magick = new Vector(32768.0f); + Vector scale = new Vector(255f) / new Vector(256f); + + // need to copy to a temporary struct, because + // SimdUtils.Octet.OfUInt32 temp = Unsafe.As, SimdUtils.Octet.OfUInt32>(ref x) + // does not work. TODO: This might be a CoreClr bug, need to ask/report + var temp = default(Octet.OfUInt32); + ref Vector tempRef = ref Unsafe.As>(ref temp); + + for (int i = 0; i < n; i++) + { + // union { float f; uint32_t i; } u; + // u.f = 32768.0f + x * (255.0f / 256.0f); + // return (uint8_t)u.i; + Vector x = Unsafe.Add(ref srcBase, i); + x = (x * scale) + magick; + tempRef = x; + + ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i); + d.LoadFrom(ref temp); + } + } } } } \ No newline at end of file diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index 5c0b8ee93..fd263b54c 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -52,9 +52,8 @@ namespace SixLabors.ImageSharp } /// - /// A variant of , which is faster on new RyuJIT runtime. + /// Implementation , which is faster on new RyuJIT runtime. /// - // ReSharper disable once MemberHidesStaticFromOuterClass internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { DebugGuard.IsTrue( @@ -116,13 +115,8 @@ namespace SixLabors.ImageSharp } /// - /// A variant of , which is faster on new .NET runtime. + /// Implementation of , which is faster on new .NET runtime. /// - /// - /// It does NOT worth yet to utilize this method (2018 Oct). - /// See benchmark results for the "PackFromVector4_Rgba32" benchmark! - /// TODO: Check again later! - /// internal static void BulkConvertNormalizedFloatToByteClampOverflows( ReadOnlySpan source, Span dest) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 73e9bacfa..111ac2240 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -22,17 +22,10 @@ namespace SixLabors.ImageSharp public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8; - internal static void GuardAvx2(string operation) - { - if (!IsAvx2CompatibleArchitecture) - { - throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!"); - } - } - /// /// Transform all scalars in 'v' in a way that converting them to would have rounding semantics. /// + /// The vector [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static Vector4 PseudoRound(this Vector4 v) { @@ -48,14 +41,15 @@ namespace SixLabors.ImageSharp /// https://github.com/g-truc/glm/blob/master/glm/simd/common.h#L110 /// /// + /// The vector [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static Vector FastRound(this Vector x) + internal static Vector FastRound(this Vector v) { Vector magic0 = new Vector(int.MinValue); // 0x80000000 Vector sgn0 = Vector.AsVectorSingle(magic0); - Vector and0 = Vector.BitwiseAnd(sgn0, x); + Vector and0 = Vector.BitwiseAnd(sgn0, v); Vector or0 = Vector.BitwiseOr(and0, new Vector(8388608.0f)); - Vector add0 = Vector.Add(x, or0); + Vector add0 = Vector.Add(v, or0); Vector sub0 = Vector.Subtract(add0, or0); return sub0; } @@ -65,6 +59,8 @@ namespace SixLabors.ImageSharp /// should be the of the same size as , /// but there are no restrictions on the span's length. /// + /// The source span of bytes + /// The destination span of floats internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); @@ -92,6 +88,8 @@ namespace SixLabors.ImageSharp /// should be the of the same size as , /// but there are no restrictions on the span's length. /// + /// The source span of floats + /// The destination span of bytes internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); @@ -119,5 +117,13 @@ namespace SixLabors.ImageSharp } } } + + private static void GuardAvx2(string operation) + { + if (!IsAvx2CompatibleArchitecture) + { + throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!"); + } + } } } \ No newline at end of file From d1d52a713336fd3e411777044cdbd474c245a3b8 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sun, 21 Oct 2018 17:29:34 +0200 Subject: [PATCH 17/22] FallbackIntrinsics128 + ImageMaths.Modulo* implementations --- src/ImageSharp/Common/Helpers/ImageMaths.cs | 16 ++ .../Helpers/SimdUtils.BasicIntrinsics256.cs | 2 +- .../Helpers/SimdUtils.ExtendedIntrinsics.cs | 5 +- .../SimdUtils.FallbackIntrinsics128.cs | 143 ++++++++++++++++++ src/ImageSharp/Common/Helpers/SimdUtils.cs | 6 +- .../Color/Bulk/PackFromVector4.cs | 26 +--- .../Color/Bulk/ToVector4.cs | 25 +-- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 27 +++- .../Helpers/ImageMathsTests.cs | 54 +++++++ 9 files changed, 256 insertions(+), 48 deletions(-) create mode 100644 src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs index 35769d96a..e4fd9bce6 100644 --- a/src/ImageSharp/Common/Helpers/ImageMaths.cs +++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs @@ -39,6 +39,22 @@ namespace SixLabors.ImageSharp return (a / GreatestCommonDivisor(a, b)) * b; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int Modulo4(int a) => a & 3; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int Modulo8(int a) => a & 7; + + /// + /// Fast (mod m) calculator, + /// where should be a power of 2. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int ModuloP2(int a, int m) + { + return a & (m - 1); + } + /// /// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation. /// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs index a8b343498..c7fd21a8f 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs @@ -14,7 +14,7 @@ namespace SixLabors.ImageSharp internal static partial class SimdUtils { /// - /// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*) + /// Implementation with 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen etc.) /// public static class BasicIntrinsics256 { diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index fd263b54c..996a08fb4 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -10,8 +10,9 @@ namespace SixLabors.ImageSharp internal static partial class SimdUtils { /// - /// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+) - /// PR: + /// Implementation methods based on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*). + /// Only accelerated only on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+) + /// See: /// https://github.com/dotnet/coreclr/pull/10662 /// API Proposal: /// https://github.com/dotnet/corefx/issues/15957 diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs new file mode 100644 index 000000000..bb2147466 --- /dev/null +++ b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs @@ -0,0 +1,143 @@ +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + /// + /// Fallback implementation based on (128bit). + /// For , efficient software fallback implementations are present + /// + maybe even mono can emit intrinsics for that type :P + /// + public static class FallbackIntrinsics128 + { + /// + /// as much elements as possible, slicing them down (keeping the remainder). + /// + internal static void BulkConvertByteToNormalizedFloatReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + int remainder = source.Length % 4; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertByteToNormalizedFloat( + source.Slice(0, alignedCount), + dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } + + /// + /// as much elements as possible, slicing them down (keeping the remainder). + /// + internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + int remainder = source.Length % 4; + int alignedCount = source.Length - remainder; + + if (alignedCount > 0) + { + BulkConvertNormalizedFloatToByteClampOverflows( + source.Slice(0, alignedCount), + dest.Slice(0, alignedCount)); + + source = source.Slice(alignedCount); + dest = dest.Slice(alignedCount); + } + } + + /// + /// Implementation of using . + /// + internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) + { + DebugGuard.IsTrue((dest.Length % 4) == 0, nameof(dest), "dest.Length should be divisible by 4!"); + + int count = dest.Length / 4; + if (count == 0) + { + return; + } + + ref ByteVector4 sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref Vector4 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + + const float Scale = 1f / 255f; + Vector4 d = default; + + for (int i = 0; i < count; i++) + { + ref ByteVector4 s = ref Unsafe.Add(ref sBase, i); + d.X = s.X; + d.Y = s.Y; + d.Z = s.Z; + d.W = s.W; + d *= Scale; + Unsafe.Add(ref dBase, i) = d; + } + } + + /// + /// Implementation of using . + /// + internal static void BulkConvertNormalizedFloatToByteClampOverflows( + ReadOnlySpan source, + Span dest) + { + DebugGuard.IsTrue((source.Length % 4) == 0, nameof(source), "source.Length should be divisible by 4!"); + + int count = source.Length / 4; + if (count == 0) + { + return; + } + + ref Vector4 sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref ByteVector4 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + + var half = new Vector4(0.5f); + var maxBytes = new Vector4(255f); + + for (int i = 0; i < count; i++) + { + Vector4 s = Unsafe.Add(ref sBase, i); + s *= maxBytes; + s += half; + + // I'm not sure if Clamp() is properly implemented with intrinsics. + s = Vector4.Max(Vector4.Zero, s); + s = Vector4.Min(maxBytes, s); + + ref ByteVector4 d = ref Unsafe.Add(ref dBase, i); + d.X = (byte)s.X; + d.Y = (byte)s.Y; + d.Z = (byte)s.Z; + d.W = (byte)s.W; + } + } + + [StructLayout(LayoutKind.Sequential)] + private struct ByteVector4 + { + public byte X; + public byte Y; + public byte Z; + public byte W; + } + } + } +} \ No newline at end of file diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 111ac2240..bc75dc8ca 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -55,7 +55,7 @@ namespace SixLabors.ImageSharp } /// - /// Converts `dest.Length` -s to -s normalized into [0..1]. + /// Converts all input -s to -s normalized into [0..1]. /// should be the of the same size as , /// but there are no restrictions on the span's length. /// @@ -67,6 +67,7 @@ namespace SixLabors.ImageSharp ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); + FallbackIntrinsics128.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); // Deal with the remainder: int count = source.Length; @@ -83,7 +84,7 @@ namespace SixLabors.ImageSharp } /// - /// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of . + /// Convert all values normalized into [0..1] from 'source' into 'dest' buffer of . /// The values are scaled up into [0-255] and rounded, overflows are clamped. /// should be the of the same size as , /// but there are no restrictions on the span's length. @@ -96,6 +97,7 @@ namespace SixLabors.ImageSharp ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); + FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); // Deal with the remainder: int count = source.Length; diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index 7a212b052..a56082fcd 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -72,30 +72,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public class PackFromVector4_Rgba32 : PackFromVector4 { [Benchmark] - public void BasicBulk() + public void FallbackIntrinsics128() { - ref Vector4 sBase = ref this.source.GetSpan()[0]; - ref Rgba32 dBase = ref this.destination.GetSpan()[0]; - - Vector4 maxBytes = new Vector4(255); - Vector4 half = new Vector4(0.5f); + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - for (int i = 0; i < this.Count; i++) - { - Vector4 v = Unsafe.Add(ref sBase, i); - v *= maxBytes; - v += half; - v = Vector4.Clamp(v, Vector4.Zero, maxBytes); - ref Rgba32 d = ref Unsafe.Add(ref dBase, i); - d.R = (byte)v.X; - d.G = (byte)v.Y; - d.B = (byte)v.Z; - d.A = (byte)v.W; - } + SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } [Benchmark(Baseline = true)] - public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows() + public void BasicIntrinsics256() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); @@ -104,7 +90,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } [Benchmark] - public void ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows() + public void ExtendedIntrinsic() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 4a801d64e..519edaa31 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -79,29 +79,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public class ToVector4_Rgba32 : ToVector4 { [Benchmark] - public void BasicBulk() + public void FallbackIntrinsics128() { - ref Rgba32 sBase = ref this.source.GetSpan()[0]; - ref Vector4 dBase = ref this.destination.GetSpan()[0]; - - Vector4 scale = new Vector4(1f / 255f); - - Vector4 v = default; + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - for (int i = 0; i < this.Count; i++) - { - ref Rgba32 s = ref Unsafe.Add(ref sBase, i); - v.X = s.R; - v.Y = s.G; - v.Z = s.B; - v.W = s.A; - v *= scale; - Unsafe.Add(ref dBase, i) = v; - } + SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } [Benchmark(Baseline = true)] - public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat() + public void BasicIntrinsics256() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); @@ -110,7 +97,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } [Benchmark] - public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat() + public void ExtendedIntrinsics() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 2dcba2b74..feefd1758 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common [InlineData(1, 8)] [InlineData(2, 16)] [InlineData(3, 128)] - public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count) + public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count) { if (this.SkipOnNonAvx2()) { @@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common [InlineData(1, 8)] [InlineData(2, 16)] [InlineData(3, 128)] - public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count) + public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count) { if (this.SkipOnNonAvx2()) { @@ -161,6 +161,7 @@ namespace SixLabors.ImageSharp.Tests.Common } public static readonly TheoryData ArraySizesDivisibleBy8 = new TheoryData { 0, 8, 16, 1024 }; + public static readonly TheoryData ArraySizesDivisibleBy4 = new TheoryData { 0, 4, 8, 28, 1020 }; public static readonly TheoryData ArraySizesDivisibleBy32 = new TheoryData { 0, 32, 512 }; @@ -170,9 +171,18 @@ namespace SixLabors.ImageSharp.Tests.Common 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 520, }; + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy4))] + public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count) + { + TestImpl_BulkConvertByteToNormalizedFloat( + count, + (s, d) => SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(s.Span, d.Span)); + } + [Theory] [MemberData(nameof(ArraySizesDivisibleBy8))] - public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count) + public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat(int count) { if (this.SkipOnNonAvx2()) { @@ -215,9 +225,18 @@ namespace SixLabors.ImageSharp.Tests.Common Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy4))] + public void FallbackIntrinsics128_BulkConvertNormalizedFloatToByteClampOverflows(int count) + { + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, + (s, d) => SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span) + ); + } + [Theory] [MemberData(nameof(ArraySizesDivisibleBy8))] - public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) + public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows(int count) { if (this.SkipOnNonAvx2()) { diff --git a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs index 6c2979fe9..aec4d0b81 100644 --- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs +++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs @@ -9,6 +9,60 @@ namespace SixLabors.ImageSharp.Tests.Helpers public class ImageMathsTests { + [Theory] + [InlineData(0, 0)] + [InlineData(1, 1)] + [InlineData(2, 2)] + [InlineData(3, 3)] + [InlineData(4, 0)] + [InlineData(100, 0)] + [InlineData(123, 3)] + [InlineData(53436353, 1)] + public void Modulo4(int a, int expected) + { + int actual = ImageMaths.Modulo4(a); + Assert.Equal(expected, actual); + } + + [Theory] + [InlineData(0, 0)] + [InlineData(1, 1)] + [InlineData(2, 2)] + [InlineData(6, 6)] + [InlineData(7, 7)] + [InlineData(8, 0)] + [InlineData(100, 4)] + [InlineData(123, 3)] + [InlineData(53436353, 1)] + [InlineData(975, 7)] + public void Modulo8(int a, int expected) + { + int actual = ImageMaths.Modulo8(a); + Assert.Equal(expected, actual); + } + + [Theory] + [InlineData(0, 2, 0)] + [InlineData(1, 2, 1)] + [InlineData(2, 2, 0)] + [InlineData(0, 4, 0)] + [InlineData(3, 4, 3)] + [InlineData(5, 4, 1)] + [InlineData(5, 8, 5)] + [InlineData(8, 8, 0)] + [InlineData(8, 16, 8)] + [InlineData(15, 16, 15)] + [InlineData(17, 16, 1)] + [InlineData(17, 32, 17)] + [InlineData(31, 32, 31)] + [InlineData(32, 32, 0)] + [InlineData(33, 32, 1)] + public void Modulo2P(int a, int m, int expected) + { + int actual = ImageMaths.ModuloP2(a, m); + Assert.Equal(expected, actual); + } + [Fact] public void FasAbsResultMatchesMath() { From bf7c9338960aa5d6846d6062c107e2305c518609 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sun, 21 Oct 2018 21:00:15 +0200 Subject: [PATCH 18/22] minimize ceremonial overhead in BulkConvertByteToNormalizedFloat() and BulkConvertNormalizedFloatToByteClampOverflows() --- src/ImageSharp/Common/Helpers/ImageMaths.cs | 45 +++++---- .../Helpers/SimdUtils.BasicIntrinsics256.cs | 91 ++++++++++-------- .../Helpers/SimdUtils.ExtendedIntrinsics.cs | 96 ++++++++++--------- .../SimdUtils.FallbackIntrinsics128.cs | 56 +++++++---- src/ImageSharp/Common/Helpers/SimdUtils.cs | 84 +++++++++++----- .../Color/Bulk/PackFromVector4.cs | 50 +++++----- .../Color/Bulk/ToVector4.cs | 50 +++++----- .../General/{ => BasicMath}/Abs.cs | 8 +- .../General/{ => BasicMath}/Clamp.cs | 10 +- .../BasicMath/ModuloPowerOfTwoConstant.cs | 23 +++++ .../BasicMath/ModuloPowerOfTwoVariable.cs | 32 +++++++ .../General/{ => BasicMath}/Pow.cs | 3 +- .../ImageSharp.Benchmarks/General/Modulus.cs | 19 ---- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 17 +++- .../Helpers/ImageMathsTests.cs | 94 ++++++++++-------- 15 files changed, 406 insertions(+), 272 deletions(-) rename tests/ImageSharp.Benchmarks/General/{ => BasicMath}/Abs.cs (88%) rename tests/ImageSharp.Benchmarks/General/{ => BasicMath}/Clamp.cs (94%) create mode 100644 tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoConstant.cs create mode 100644 tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoVariable.cs rename tests/ImageSharp.Benchmarks/General/{ => BasicMath}/Pow.cs (93%) delete mode 100644 tests/ImageSharp.Benchmarks/General/Modulus.cs diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs index e4fd9bce6..1395975ec 100644 --- a/src/ImageSharp/Common/Helpers/ImageMaths.cs +++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs @@ -39,22 +39,31 @@ namespace SixLabors.ImageSharp return (a / GreatestCommonDivisor(a, b)) * b; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int Modulo4(int a) => a & 3; + /// + /// Calculates % 4 + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static int Modulo4(int x) => x & 3; - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int Modulo8(int a) => a & 7; + /// + /// Calculates % 8 + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static int Modulo8(int x) => x & 7; /// - /// Fast (mod m) calculator, - /// where should be a power of 2. + /// Fast (x mod m) calculator, with the restriction that + /// should be power of 2. /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int ModuloP2(int a, int m) + [MethodImpl(InliningOptions.ShortMethod)] + public static int ModuloP2(int x, int m) { - return a & (m - 1); + return x & (m - 1); } + [MethodImpl(InliningOptions.ShortMethod)] + public static float Clamp(float x, float min, float max) => Math.Min(max, Math.Max(min, x)); + /// /// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation. /// @@ -62,7 +71,7 @@ namespace SixLabors.ImageSharp /// A number that is greater than , but less than or equal to /// /// The - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(InliningOptions.ShortMethod)] public static int FastAbs(int x) { int y = x >> 31; @@ -74,7 +83,7 @@ namespace SixLabors.ImageSharp /// /// A single-precision floating-point number /// The number raised to the power of 2. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(InliningOptions.ShortMethod)] public static float Pow2(float x) => x * x; /// @@ -82,7 +91,7 @@ namespace SixLabors.ImageSharp /// /// A single-precision floating-point number /// The number raised to the power of 3. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(InliningOptions.ShortMethod)] public static float Pow3(float x) => x * x * x; /// @@ -93,7 +102,7 @@ namespace SixLabors.ImageSharp /// /// The /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(InliningOptions.ShortMethod)] public static int GetBitsNeededForColorDepth(int colors) => Math.Max(1, (int)Math.Ceiling(Math.Log(colors, 2))); /// @@ -101,7 +110,7 @@ namespace SixLabors.ImageSharp /// /// The bit depth. /// The - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(InliningOptions.ShortMethod)] public static int GetColorCountForBitDepth(int bitDepth) => 1 << bitDepth; /// @@ -110,7 +119,7 @@ namespace SixLabors.ImageSharp /// The x provided to G(x). /// The spread of the blur. /// The Gaussian G(x) - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(InliningOptions.ShortMethod)] public static float Gaussian(float x, float sigma) { const float Numerator = 1.0f; @@ -133,7 +142,7 @@ namespace SixLabors.ImageSharp /// /// The sine cardinal of . /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(InliningOptions.ShortMethod)] public static float SinC(float f) { if (MathF.Abs(f) > Constants.Epsilon) @@ -156,7 +165,7 @@ namespace SixLabors.ImageSharp /// /// The . /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(InliningOptions.ShortMethod)] public static float GetBcValue(float x, float b, float c) { if (x < 0F) @@ -192,7 +201,7 @@ namespace SixLabors.ImageSharp /// /// The bounding . /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(InliningOptions.ShortMethod)] public static Rectangle GetBoundingRectangle(Point topLeft, Point bottomRight) => new Rectangle(topLeft.X, topLeft.Y, bottomRight.X - topLeft.X, bottomRight.Y - topLeft.Y); /// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs index c7fd21a8f..713d606e7 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs @@ -21,28 +21,58 @@ namespace SixLabors.ImageSharp public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture; /// - /// as much elements as possible, slicing them down (keeping the remainder). + /// as many elements as possible, slicing them down (keeping the remainder). /// + [MethodImpl(InliningOptions.ShortMethod)] internal static void BulkConvertByteToNormalizedFloatReduce( ref ReadOnlySpan source, ref Span dest) { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); - if (IsAvailable) + if (!IsAvailable) { - int remainder = source.Length % 8; - int alignedCount = source.Length - remainder; - - if (alignedCount > 0) - { - BulkConvertByteToNormalizedFloat( - source.Slice(0, alignedCount), - dest.Slice(0, alignedCount)); - - source = source.Slice(alignedCount); - dest = dest.Slice(alignedCount); - } + return; + } + + int remainder = ImageMaths.Modulo8(source.Length); + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + BulkConvertByteToNormalizedFloat( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount)); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); + } + } + + /// + /// as many elements as possible, slicing them down (keeping the remainder). + /// + [MethodImpl(InliningOptions.ShortMethod)] + internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + if (!IsAvailable) + { + return; + } + + int remainder = ImageMaths.Modulo8(source.Length); + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount)); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); } } @@ -57,7 +87,7 @@ namespace SixLabors.ImageSharp { GuardAvx2(nameof(BulkConvertByteToNormalizedFloat)); - DebugGuard.IsTrue((dest.Length % 8) == 0, nameof(source), "dest.Length should be divisable by 8!"); + DebugGuard.IsTrue(ImageMaths.Modulo8(dest.Length) == 0, nameof(source), "dest.Length should be divisable by 8!"); var bVec = new Vector(256.0f / 255.0f); var magicFloat = new Vector(32768.0f); @@ -93,30 +123,6 @@ namespace SixLabors.ImageSharp } } - /// - /// as much elements as possible, slicing them down (keeping the remainder). - /// - internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( - ref ReadOnlySpan source, - ref Span dest) - { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); - - if (IsAvailable) - { - int remainder = source.Length % Vector.Count; - int alignedCount = source.Length - remainder; - - if (alignedCount > 0) - { - BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount)); - - source = source.Slice(alignedCount); - dest = dest.Slice(alignedCount); - } - } - } - /// /// Implementation of which is faster on older runtimes. /// @@ -124,7 +130,7 @@ namespace SixLabors.ImageSharp { GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows)); - DebugGuard.IsTrue((source.Length % 8) == 0, nameof(source), "source.Length should be divisible by 8!"); + DebugGuard.IsTrue(ImageMaths.Modulo8(source.Length) == 0, nameof(source), "source.Length should be divisible by 8!"); if (source.Length == 0) { @@ -174,7 +180,10 @@ namespace SixLabors.ImageSharp { GuardAvx2(nameof(BulkConvertNormalizedFloatToByte)); - DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!"); + DebugGuard.IsTrue( + ImageMaths.Modulo8(source.Length) == 0, + nameof(source), + "source.Length should be divisible by 8!"); if (source.Length == 0) { diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index 996a08fb4..dfa6f189c 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -28,27 +28,58 @@ namespace SixLabors.ImageSharp #endif /// - /// as much elements as possible, slicing them down (keeping the remainder). + /// as many elements as possible, slicing them down (keeping the remainder). /// - [Conditional("NETCOREAPP2_1")] + [MethodImpl(InliningOptions.ShortMethod)] internal static void BulkConvertByteToNormalizedFloatReduce( ref ReadOnlySpan source, ref Span dest) { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); - if (IsAvailable) + if (!IsAvailable) { - int remainder = source.Length % Vector.Count; - int alignedCount = source.Length - remainder; + return; + } + + int remainder = ImageMaths.ModuloP2(source.Length, Vector.Count); + int adjustedCount = source.Length - remainder; - if (alignedCount > 0) - { - BulkConvertByteToNormalizedFloat(source.Slice(0, alignedCount), dest.Slice(0, alignedCount)); + if (adjustedCount > 0) + { + BulkConvertByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount)); - source = source.Slice(alignedCount); - dest = dest.Slice(alignedCount); - } + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); + } + } + + /// + /// as many elements as possible, slicing them down (keeping the remainder). + /// + [MethodImpl(InliningOptions.ShortMethod)] + internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + + if (!IsAvailable) + { + return; + } + + int remainder = ImageMaths.ModuloP2(source.Length, Vector.Count); + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + BulkConvertNormalizedFloatToByteClampOverflows( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount)); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); } } @@ -58,7 +89,7 @@ namespace SixLabors.ImageSharp internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { DebugGuard.IsTrue( - dest.Length % Vector.Count == 0, + ImageMaths.ModuloP2(dest.Length, Vector.Count) == 0, nameof(source), "dest.Length should be divisible by Vector.Count!"); @@ -67,8 +98,6 @@ namespace SixLabors.ImageSharp ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - var scale = new Vector(1f / 255f); - for (int i = 0; i < n; i++) { Vector b = Unsafe.Add(ref sourceBase, i); @@ -77,10 +106,10 @@ namespace SixLabors.ImageSharp Vector.Widen(s0, out Vector w0, out Vector w1); Vector.Widen(s1, out Vector w2, out Vector w3); - Vector f0 = ConvertToSingle(w0, scale); - Vector f1 = ConvertToSingle(w1, scale); - Vector f2 = ConvertToSingle(w2, scale); - Vector f3 = ConvertToSingle(w3, scale); + Vector f0 = ConvertToSingle(w0); + Vector f1 = ConvertToSingle(w1); + Vector f2 = ConvertToSingle(w2); + Vector f3 = ConvertToSingle(w3); ref Vector d = ref Unsafe.Add(ref destBase, i * 4); d = f0; @@ -90,31 +119,6 @@ namespace SixLabors.ImageSharp } } - /// - /// as much elements as possible, slicing them down (keeping the remainder). - /// - [Conditional("NETCOREAPP2_1")] - internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( - ref ReadOnlySpan source, - ref Span dest) - { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); - - if (IsAvailable) - { - int remainder = source.Length % Vector.Count; - int alignedCount = source.Length - remainder; - - if (alignedCount > 0) - { - BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount)); - - source = source.Slice(alignedCount); - dest = dest.Slice(alignedCount); - } - } - } - /// /// Implementation of , which is faster on new .NET runtime. /// @@ -123,7 +127,7 @@ namespace SixLabors.ImageSharp Span dest) { DebugGuard.IsTrue( - dest.Length % Vector.Count == 0, + ImageMaths.ModuloP2(dest.Length, Vector.Count) == 0, nameof(dest), "dest.Length should be divisible by Vector.Count!"); @@ -168,11 +172,11 @@ namespace SixLabors.ImageSharp } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector ConvertToSingle(Vector u, Vector scale) + private static Vector ConvertToSingle(Vector u) { Vector vi = Vector.AsVectorInt32(u); Vector v = Vector.ConvertToSingle(vi); - v *= scale; + v *= new Vector(1f / 255f); return v; } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs index bb2147466..2d9f53eaf 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs @@ -1,71 +1,81 @@ -using System; +// Copyright (c) Six Labors and contributors. +// Licensed under the Apache License, Version 2.0. + +using System; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +// ReSharper disable MemberHidesStaticFromOuterClass namespace SixLabors.ImageSharp { internal static partial class SimdUtils { /// /// Fallback implementation based on (128bit). - /// For , efficient software fallback implementations are present - /// + maybe even mono can emit intrinsics for that type :P + /// For , efficient software fallback implementations are present, + /// and we hope that even mono's JIT is able to emit SIMD instructions for that type :P /// public static class FallbackIntrinsics128 { /// - /// as much elements as possible, slicing them down (keeping the remainder). + /// as many elements as possible, slicing them down (keeping the remainder). /// + [MethodImpl(InliningOptions.ShortMethod)] internal static void BulkConvertByteToNormalizedFloatReduce( ref ReadOnlySpan source, ref Span dest) { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); - int remainder = source.Length % 4; - int alignedCount = source.Length - remainder; + int remainder = ImageMaths.Modulo4(source.Length); + int adjustedCount = source.Length - remainder; - if (alignedCount > 0) + if (adjustedCount > 0) { BulkConvertByteToNormalizedFloat( - source.Slice(0, alignedCount), - dest.Slice(0, alignedCount)); + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount)); - source = source.Slice(alignedCount); - dest = dest.Slice(alignedCount); + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); } } /// - /// as much elements as possible, slicing them down (keeping the remainder). + /// as many elements as possible, slicing them down (keeping the remainder). /// + [MethodImpl(InliningOptions.ShortMethod)] internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( ref ReadOnlySpan source, ref Span dest) { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); - int remainder = source.Length % 4; - int alignedCount = source.Length - remainder; + int remainder = ImageMaths.Modulo4(source.Length); + int adjustedCount = source.Length - remainder; - if (alignedCount > 0) + if (adjustedCount > 0) { BulkConvertNormalizedFloatToByteClampOverflows( - source.Slice(0, alignedCount), - dest.Slice(0, alignedCount)); + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount)); - source = source.Slice(alignedCount); - dest = dest.Slice(alignedCount); + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); } } /// /// Implementation of using . /// + [MethodImpl(InliningOptions.ColdPath)] internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - DebugGuard.IsTrue((dest.Length % 4) == 0, nameof(dest), "dest.Length should be divisible by 4!"); + DebugGuard.IsTrue( + ImageMaths.Modulo4(dest.Length) == 0, + nameof(dest), + "dest.Length should be divisible by 4!"); int count = dest.Length / 4; if (count == 0) @@ -94,11 +104,15 @@ namespace SixLabors.ImageSharp /// /// Implementation of using . /// + [MethodImpl(InliningOptions.ColdPath)] internal static void BulkConvertNormalizedFloatToByteClampOverflows( ReadOnlySpan source, Span dest) { - DebugGuard.IsTrue((source.Length % 4) == 0, nameof(source), "source.Length should be divisible by 4!"); + DebugGuard.IsTrue( + ImageMaths.Modulo4(source.Length) == 0, + nameof(source), + "source.Length should be divisible by 4!"); int count = source.Length / 4; if (count == 0) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index bc75dc8ca..95a6030fd 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -2,6 +2,7 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -61,25 +62,22 @@ namespace SixLabors.ImageSharp /// /// The source span of bytes /// The destination span of floats + [MethodImpl(InliningOptions.ShortMethod)] internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); +#if NETCOREAPP2_1 ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); +#else BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); +#endif FallbackIntrinsics128.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); // Deal with the remainder: - int count = source.Length; - if (count > 0) + if (source.Length > 0) { - // TODO: Do we need to optimize anything on this? (There are at most 7 remainders) - ref byte sBase = ref MemoryMarshal.GetReference(source); - ref float dBase = ref MemoryMarshal.GetReference(dest); - for (int i = 0; i < count; i++) - { - Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f; - } + ConverByteToNormalizedFloatRemainder(source, dest); } } @@ -91,35 +89,71 @@ namespace SixLabors.ImageSharp /// /// The source span of floats /// The destination span of bytes + [MethodImpl(InliningOptions.ShortMethod)] internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); +#if NETCOREAPP2_1 ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); +#else BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); +#endif FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); // Deal with the remainder: - int count = source.Length; - if (count > 0) + if (source.Length > 0) { - ref float sBase = ref MemoryMarshal.GetReference(source); - ref byte dBase = ref MemoryMarshal.GetReference(dest); - - for (int i = 0; i < count; i++) - { - // TODO: Do we need to optimize anything on this? (There are at most 7 remainders) - float f = Unsafe.Add(ref sBase, i); - f *= 255f; - f += 0.5f; - f = MathF.Max(0, f); - f = MathF.Min(255f, f); - - Unsafe.Add(ref dBase, i) = (byte)f; - } + ConvertNormalizedFloatToByteRemainder(source, dest); } } + [MethodImpl(InliningOptions.ColdPath)] + private static void ConverByteToNormalizedFloatRemainder(ReadOnlySpan source, Span dest) + { + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref float dBase = ref MemoryMarshal.GetReference(dest); + + // There are at most 3 elements at this point, having a for loop is overkill. + // Let's minimize the no. of instructions! + switch (source.Length) + { + case 3: + Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2) / 255f; + goto case 2; + case 2: + Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1) / 255f; + goto case 1; + case 1: + dBase = sBase / 255f; + break; + } + } + + [MethodImpl(InliningOptions.ColdPath)] + private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan source, Span dest) + { + ref float sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); + + switch (source.Length) + { + case 3: + Unsafe.Add(ref dBase, 2) = ConvertToByte(Unsafe.Add(ref sBase, 2)); + goto case 2; + case 2: + Unsafe.Add(ref dBase, 1) = ConvertToByte(Unsafe.Add(ref sBase, 1)); + goto case 1; + case 1: + dBase = ConvertToByte(sBase); + break; + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static byte ConvertToByte(float f) => (byte)ImageMaths.Clamp((f * 255f) + 0.5f, 0, 255f); + + [Conditional("DEBUG")] private static void GuardAvx2(string operation) { if (!IsAvx2CompatibleArchitecture) diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index a56082fcd..eaa52a975 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -99,30 +99,30 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } // RESULTS (2018 October): - // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated | - // ------------------------------------------------------------------ |-------- |------ |-------------:|-------------:|-----------:|-------:|---------:|-------:|----------:| - // BasicBulk | Clr | 64 | 581.62 ns | 33.625 ns | 1.8999 ns | 2.27 | 0.02 | - | 0 B | - // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 64 | 256.66 ns | 45.153 ns | 2.5512 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 64 | 201.92 ns | 30.161 ns | 1.7042 ns | 0.79 | 0.01 | - | 0 B | - // PixelOperations_Base | Clr | 64 | 665.01 ns | 13.032 ns | 0.7363 ns | 2.59 | 0.02 | 0.0067 | 24 B | - // PixelOperations_Specialized | Clr | 64 | 295.14 ns | 26.335 ns | 1.4880 ns | 1.15 | 0.01 | - | 0 B | - // | | | | | | | | | | - // BasicBulk | Core | 64 | 513.22 ns | 91.110 ns | 5.1479 ns | 3.19 | 0.03 | - | 0 B | - // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Core | 64 | 160.76 ns | 2.760 ns | 0.1559 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 64 | 95.98 ns | 10.077 ns | 0.5694 ns | 0.60 | 0.00 | - | 0 B | - // PixelOperations_Base | Core | 64 | 591.74 ns | 49.856 ns | 2.8170 ns | 3.68 | 0.01 | 0.0067 | 24 B | - // PixelOperations_Specialized | Core | 64 | 149.11 ns | 4.485 ns | 0.2534 ns | 0.93 | 0.00 | - | 0 B | - // | | | | | | | | | | - // BasicBulk | Clr | 2048 | 15,345.85 ns | 1,213.551 ns | 68.5679 ns | 3.90 | 0.01 | - | 0 B | - // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3,939.49 ns | 71.101 ns | 4.0173 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 2,272.61 ns | 110.671 ns | 6.2531 ns | 0.58 | 0.00 | - | 0 B | - // PixelOperations_Base | Clr | 2048 | 17,422.47 ns | 811.733 ns | 45.8644 ns | 4.42 | 0.01 | - | 24 B | - // PixelOperations_Specialized | Clr | 2048 | 3,984.26 ns | 110.352 ns | 6.2351 ns | 1.01 | 0.00 | - | 0 B | - // | | | | | | | | | | - // BasicBulk | Core | 2048 | 14,950.43 ns | 699.309 ns | 39.5123 ns | 3.76 | 0.02 | - | 0 B | - // BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 3,978.28 ns | 481.105 ns | 27.1833 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 2,169.54 ns | 75.606 ns | 4.2719 ns | !!0.55!| 0.00 | - | 0 B | - // PixelOperations_Base | Core | 2048 | 18,403.62 ns | 1,494.056 ns | 84.4169 ns | 4.63 | 0.03 | - | 24 B | - // PixelOperations_Specialized | Core | 2048 | 2,227.60 ns | 486.761 ns | 27.5029 ns | !!0.56!| 0.01 | - | 0 B | + // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated | + // ---------------------------- |-------- |------ |-------------:|-------------:|------------:|-------:|---------:|-------:|----------:| + // FallbackIntrinsics128 | Clr | 64 | 340.38 ns | 22.319 ns | 1.2611 ns | 1.41 | 0.01 | - | 0 B | + // BasicIntrinsics256 | Clr | 64 | 240.79 ns | 11.421 ns | 0.6453 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic | Clr | 64 | 199.09 ns | 124.239 ns | 7.0198 ns | 0.83 | 0.02 | - | 0 B | + // PixelOperations_Base | Clr | 64 | 647.99 ns | 24.003 ns | 1.3562 ns | 2.69 | 0.01 | 0.0067 | 24 B | + // PixelOperations_Specialized | Clr | 64 | 259.79 ns | 13.391 ns | 0.7566 ns | 1.08 | 0.00 | - | 0 B | <--- ceremonial overhead has been minimized! + // | | | | | | | | | | + // FallbackIntrinsics128 | Core | 64 | 234.64 ns | 12.320 ns | 0.6961 ns | 1.58 | 0.00 | - | 0 B | + // BasicIntrinsics256 | Core | 64 | 148.87 ns | 2.794 ns | 0.1579 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic | Core | 64 | 94.06 ns | 10.015 ns | 0.5659 ns | 0.63 | 0.00 | - | 0 B | + // PixelOperations_Base | Core | 64 | 573.52 ns | 31.865 ns | 1.8004 ns | 3.85 | 0.01 | 0.0067 | 24 B | + // PixelOperations_Specialized | Core | 64 | 117.21 ns | 13.264 ns | 0.7494 ns | 0.79 | 0.00 | - | 0 B | + // | | | | | | | | | | + // FallbackIntrinsics128 | Clr | 2048 | 6,735.93 ns | 2,139.340 ns | 120.8767 ns | 1.71 | 0.03 | - | 0 B | + // BasicIntrinsics256 | Clr | 2048 | 3,929.29 ns | 334.027 ns | 18.8731 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic | Clr | 2048 | 2,226.01 ns | 130.525 ns | 7.3749 ns |!! 0.57 | 0.00 | - | 0 B | <--- ExtendedIntrinsics rock! + // PixelOperations_Base | Clr | 2048 | 16,760.84 ns | 367.800 ns | 20.7814 ns | 4.27 | 0.02 | - | 24 B | <--- Extra copies using "Vector4 TPixel.ToVector4()" + // PixelOperations_Specialized | Clr | 2048 | 3,986.03 ns | 237.238 ns | 13.4044 ns | 1.01 | 0.00 | - | 0 B | <--- can't yet detect whether ExtendedIntrinsics are available :( + // | | | | | | | | | | + // FallbackIntrinsics128 | Core | 2048 | 6,644.65 ns | 2,677.090 ns | 151.2605 ns | 1.69 | 0.05 | - | 0 B | + // BasicIntrinsics256 | Core | 2048 | 3,923.70 ns | 1,971.760 ns | 111.4081 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsic | Core | 2048 | 2,092.32 ns | 375.657 ns | 21.2253 ns |!! 0.53 | 0.01 | - | 0 B | <--- ExtendedIntrinsics rock! + // PixelOperations_Base | Core | 2048 | 16,875.73 ns | 1,271.957 ns | 71.8679 ns | 4.30 | 0.10 | - | 24 B | + // PixelOperations_Specialized | Core | 2048 | 2,129.92 ns | 262.888 ns | 14.8537 ns |!! 0.54 | 0.01 | - | 0 B | <--- ExtendedIntrinsics rock! } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 519edaa31..2cbe549e4 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -191,30 +191,30 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk // RESULTS (2018 October): // - // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated | - // ---------------------------------------------------- |-------- |------ |------------:|-------------:|-----------:|-------:|---------:|-------:|----------:| - // BasicBulk | Clr | 64 | 267.40 ns | 30.711 ns | 1.7352 ns | 1.07 | 0.01 | - | 0 B | - // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Clr | 64 | 249.97 ns | 33.838 ns | 1.9119 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Clr | 64 | 176.97 ns | 5.221 ns | 0.2950 ns | 0.71 | 0.00 | - | 0 B | - // PixelOperations_Base | Clr | 64 | 349.70 ns | 104.331 ns | 5.8949 ns | 1.40 | 0.02 | 0.0072 | 24 B | - // PixelOperations_Specialized | Clr | 64 | 288.31 ns | 26.833 ns | 1.5161 ns | 1.15 | 0.01 | - | 0 B | - // | | | | | | | | | | - // BasicBulk | Core | 64 | 185.36 ns | 30.051 ns | 1.6979 ns | 1.26 | 0.01 | - | 0 B | - // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Core | 64 | 146.84 ns | 12.674 ns | 0.7161 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Core | 64 | 67.31 ns | 2.542 ns | 0.1436 ns | 0.46 | 0.00 | - | 0 B | - // PixelOperations_Base | Core | 64 | 272.03 ns | 94.419 ns | 5.3348 ns | 1.85 | 0.03 | 0.0072 | 24 B | - // PixelOperations_Specialized | Core | 64 | 121.91 ns | 31.477 ns | 1.7785 ns | 0.83 | 0.01 | - | 0 B | - // | | | | | | | | | | - // BasicBulk | Clr | 2048 | 5,133.04 ns | 284.052 ns | 16.0494 ns | 1.21 | 0.01 | - | 0 B | - // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Clr | 2048 | 4,248.58 ns | 1,095.887 ns | 61.9196 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Clr | 2048 | 1,214.02 ns | 184.349 ns | 10.4160 ns | 0.29 | 0.00 | - | 0 B | - // PixelOperations_Base | Clr | 2048 | 7,096.04 ns | 362.350 ns | 20.4734 ns | 1.67 | 0.02 | - | 24 B | - // PixelOperations_Specialized | Clr | 2048 | 4,314.19 ns | 204.964 ns | 11.5809 ns | 1.02 | 0.01 | - | 0 B | - // | | | | | | | | | | - // BasicBulk | Core | 2048 | 5,038.38 ns | 223.282 ns | 12.6158 ns | 1.20 | 0.01 | - | 0 B | - // BasicIntrinsics256_BulkConvertByteToNormalizedFloat | Core | 2048 | 4,199.17 ns | 897.985 ns | 50.7378 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsics_BulkConvertByteToNormalizedFloat | Core | 2048 | 1,113.86 ns | 64.799 ns | 3.6613 ns | !!0.27!| 0.00 | - | 0 B | - // PixelOperations_Base | Core | 2048 | 7,015.00 ns | 920.083 ns | 51.9864 ns | 1.67 | 0.02 | - | 24 B | - // PixelOperations_Specialized | Core | 2048 | 1,176.59 ns | 256.955 ns | 14.5184 ns | !!0.28!| 0.00 | - | 0 B | + // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated | + // ---------------------------- |-------- |------ |------------:|-------------:|------------:|-------:|---------:|-------:|----------:| + // FallbackIntrinsics128 | Clr | 64 | 287.62 ns | 6.026 ns | 0.3405 ns | 1.19 | 0.00 | - | 0 B | + // BasicIntrinsics256 | Clr | 64 | 240.83 ns | 10.585 ns | 0.5981 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics | Clr | 64 | 168.28 ns | 11.478 ns | 0.6485 ns | 0.70 | 0.00 | - | 0 B | + // PixelOperations_Base | Clr | 64 | 334.08 ns | 38.048 ns | 2.1498 ns | 1.39 | 0.01 | 0.0072 | 24 B | + // PixelOperations_Specialized | Clr | 64 | 255.41 ns | 10.939 ns | 0.6181 ns | 1.06 | 0.00 | - | 0 B | <--- ceremonial overhead has been minimized! + // | | | | | | | | | | + // FallbackIntrinsics128 | Core | 64 | 183.29 ns | 8.931 ns | 0.5046 ns | 1.32 | 0.00 | - | 0 B | + // BasicIntrinsics256 | Core | 64 | 139.18 ns | 7.633 ns | 0.4313 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics | Core | 64 | 66.29 ns | 16.366 ns | 0.9247 ns | 0.48 | 0.01 | - | 0 B | + // PixelOperations_Base | Core | 64 | 257.75 ns | 16.959 ns | 0.9582 ns | 1.85 | 0.01 | 0.0072 | 24 B | + // PixelOperations_Specialized | Core | 64 | 90.14 ns | 9.955 ns | 0.5625 ns | 0.65 | 0.00 | - | 0 B | + // | | | | | | | | | | + // FallbackIntrinsics128 | Clr | 2048 | 5,011.84 ns | 347.991 ns | 19.6621 ns | 1.22 | 0.01 | - | 0 B | + // BasicIntrinsics256 | Clr | 2048 | 4,119.35 ns | 720.153 ns | 40.6900 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics | Clr | 2048 | 1,195.29 ns | 164.389 ns | 9.2883 ns |!! 0.29 | 0.00 | - | 0 B | <--- ExtendedIntrinsics rock! + // PixelOperations_Base | Clr | 2048 | 6,820.58 ns | 823.433 ns | 46.5255 ns | 1.66 | 0.02 | - | 24 B | + // PixelOperations_Specialized | Clr | 2048 | 4,203.53 ns | 176.714 ns | 9.9847 ns | 1.02 | 0.01 | - | 0 B | <--- can't yet detect whether ExtendedIntrinsics are available :( + // | | | | | | | | | | + // FallbackIntrinsics128 | Core | 2048 | 5,017.89 ns | 4,021.533 ns | 227.2241 ns | 1.24 | 0.05 | - | 0 B | + // BasicIntrinsics256 | Core | 2048 | 4,046.51 ns | 1,150.390 ns | 64.9992 ns | 1.00 | 0.00 | - | 0 B | + // ExtendedIntrinsics | Core | 2048 | 1,130.59 ns | 832.588 ns | 47.0427 ns |!! 0.28 | 0.01 | - | 0 B | <--- ExtendedIntrinsics rock! + // PixelOperations_Base | Core | 2048 | 6,752.68 ns | 272.820 ns | 15.4148 ns | 1.67 | 0.02 | - | 24 B | + // PixelOperations_Specialized | Core | 2048 | 1,126.13 ns | 79.192 ns | 4.4745 ns |!! 0.28 | 0.00 | - | 0 B | <--- ExtendedIntrinsics rock! } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/General/Abs.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/Abs.cs similarity index 88% rename from tests/ImageSharp.Benchmarks/General/Abs.cs rename to tests/ImageSharp.Benchmarks/General/BasicMath/Abs.cs index a67f3f107..ea53959b6 100644 --- a/tests/ImageSharp.Benchmarks/General/Abs.cs +++ b/tests/ImageSharp.Benchmarks/General/BasicMath/Abs.cs @@ -1,9 +1,9 @@ -namespace SixLabors.ImageSharp.Benchmarks.General -{ - using System; +using System; - using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Attributes; +namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath +{ public class Abs { [Params(-1, 1)] diff --git a/tests/ImageSharp.Benchmarks/General/Clamp.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/Clamp.cs similarity index 94% rename from tests/ImageSharp.Benchmarks/General/Clamp.cs rename to tests/ImageSharp.Benchmarks/General/BasicMath/Clamp.cs index ef6bc3c40..d486cb2f3 100644 --- a/tests/ImageSharp.Benchmarks/General/Clamp.cs +++ b/tests/ImageSharp.Benchmarks/General/BasicMath/Clamp.cs @@ -3,13 +3,13 @@ // Licensed under the Apache License, Version 2.0. // -namespace SixLabors.ImageSharp.Benchmarks.General -{ - using System; - using System.Runtime.CompilerServices; +using System; +using System.Runtime.CompilerServices; - using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Attributes; +namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath +{ public class Clamp { [Params(-1, 0, 255, 256)] diff --git a/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoConstant.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoConstant.cs new file mode 100644 index 000000000..9ddfad722 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoConstant.cs @@ -0,0 +1,23 @@ +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Attributes.Jobs; + +namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath +{ + [LongRunJob] + public class ModuloPowerOfTwoConstant + { + private readonly int value = 42; + + [Benchmark(Baseline = true)] + public int Standard() + { + return this.value % 8; + } + + [Benchmark] + public int Bitwise() + { + return ImageMaths.Modulo8(this.value); + } + } +} \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoVariable.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoVariable.cs new file mode 100644 index 000000000..5c2fe81fa --- /dev/null +++ b/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoVariable.cs @@ -0,0 +1,32 @@ +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Attributes.Jobs; + +namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath +{ + [LongRunJob] + public class ModuloPowerOfTwoVariable + { + private readonly int value = 42; + + private readonly int m = 32; + + [Benchmark(Baseline = true)] + public int Standard() + { + return this.value % this.m; + } + + [Benchmark] + public int Bitwise() + { + return ImageMaths.ModuloP2(this.value, this.m); + } + + // RESULTS: + // + // Method | Mean | Error | StdDev | Median | Scaled | ScaledSD | + // --------- |----------:|----------:|----------:|----------:|-------:|---------:| + // Standard | 1.2465 ns | 0.0093 ns | 0.0455 ns | 1.2423 ns | 1.00 | 0.00 | + // Bitwise | 0.0265 ns | 0.0103 ns | 0.0515 ns | 0.0000 ns | 0.02 | 0.04 | + } +} \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/General/Pow.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/Pow.cs similarity index 93% rename from tests/ImageSharp.Benchmarks/General/Pow.cs rename to tests/ImageSharp.Benchmarks/General/BasicMath/Pow.cs index 325bd9d20..0f256fc78 100644 --- a/tests/ImageSharp.Benchmarks/General/Pow.cs +++ b/tests/ImageSharp.Benchmarks/General/BasicMath/Pow.cs @@ -1,7 +1,8 @@ using System; + using BenchmarkDotNet.Attributes; -namespace SixLabors.ImageSharp.Benchmarks.General +namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath { public class Pow { diff --git a/tests/ImageSharp.Benchmarks/General/Modulus.cs b/tests/ImageSharp.Benchmarks/General/Modulus.cs deleted file mode 100644 index e6d5ccce6..000000000 --- a/tests/ImageSharp.Benchmarks/General/Modulus.cs +++ /dev/null @@ -1,19 +0,0 @@ -namespace SixLabors.ImageSharp.Benchmarks.General -{ - using BenchmarkDotNet.Attributes; - - public class Modulus - { - [Benchmark(Baseline = true, Description = "Standard Modulus using %")] - public int StandardModulus() - { - return 255 % 256; - } - - [Benchmark(Description = "Bitwise Modulus using &")] - public int BitwiseModulus() - { - return 255 & 255; - } - } -} diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index feefd1758..c63cb3438 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -264,13 +264,26 @@ namespace SixLabors.ImageSharp.Tests.Common TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, (s, d) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span) ); + + // for small values, let's stress test the implementation a bit: + if (count > 0 && count < 10) + { + for (int i = 0; i < 20; i++) + { + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( + count, + (s, d) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span), + i + 42); + } + } } private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( int count, - Action, Memory> convert) + Action, Memory> convert, int seed = -1) { - float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f); + seed = seed > 0 ? seed : count; + float[] source = new Random(seed).GenerateRandomFloatArray(count, -0.2f, 1.2f); byte[] expected = source.Select(NormalizedFloatToByte).ToArray(); byte[] actual = new byte[count]; diff --git a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs index aec4d0b81..d8b1525be 100644 --- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs +++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs @@ -10,56 +10,70 @@ namespace SixLabors.ImageSharp.Tests.Helpers public class ImageMathsTests { [Theory] - [InlineData(0, 0)] - [InlineData(1, 1)] - [InlineData(2, 2)] - [InlineData(3, 3)] - [InlineData(4, 0)] - [InlineData(100, 0)] - [InlineData(123, 3)] - [InlineData(53436353, 1)] - public void Modulo4(int a, int expected) + [InlineData(0)] + [InlineData(1)] + [InlineData(2)] + [InlineData(3)] + [InlineData(4)] + [InlineData(100)] + [InlineData(123)] + [InlineData(53436353)] + public void Modulo4(int x) { - int actual = ImageMaths.Modulo4(a); - Assert.Equal(expected, actual); + int actual = ImageMaths.Modulo4(x); + Assert.Equal(x % 4, actual); } [Theory] - [InlineData(0, 0)] - [InlineData(1, 1)] + [InlineData(0)] + [InlineData(1)] + [InlineData(2)] + [InlineData(6)] + [InlineData(7)] + [InlineData(8)] + [InlineData(100)] + [InlineData(123)] + [InlineData(53436353)] + [InlineData(975)] + public void Modulo8(int x) + { + int actual = ImageMaths.Modulo8(x); + Assert.Equal(x % 8, actual); + } + + [Theory] + [InlineData(0, 2)] + [InlineData(1, 2)] [InlineData(2, 2)] - [InlineData(6, 6)] - [InlineData(7, 7)] - [InlineData(8, 0)] - [InlineData(100, 4)] - [InlineData(123, 3)] - [InlineData(53436353, 1)] - [InlineData(975, 7)] - public void Modulo8(int a, int expected) + [InlineData(0, 4)] + [InlineData(3, 4)] + [InlineData(5, 4)] + [InlineData(5, 8)] + [InlineData(8, 8)] + [InlineData(8, 16)] + [InlineData(15, 16)] + [InlineData(17, 16)] + [InlineData(17, 32)] + [InlineData(31, 32)] + [InlineData(32, 32)] + [InlineData(33, 32)] + public void Modulo2P(int x, int m) { - int actual = ImageMaths.Modulo8(a); - Assert.Equal(expected, actual); + int actual = ImageMaths.ModuloP2(x, m); + Assert.Equal(x % m, actual); } [Theory] - [InlineData(0, 2, 0)] - [InlineData(1, 2, 1)] - [InlineData(2, 2, 0)] - [InlineData(0, 4, 0)] - [InlineData(3, 4, 3)] - [InlineData(5, 4, 1)] - [InlineData(5, 8, 5)] - [InlineData(8, 8, 0)] - [InlineData(8, 16, 8)] - [InlineData(15, 16, 15)] - [InlineData(17, 16, 1)] - [InlineData(17, 32, 17)] - [InlineData(31, 32, 31)] - [InlineData(32, 32, 0)] - [InlineData(33, 32, 1)] - public void Modulo2P(int a, int m, int expected) + [InlineData(0, 0, 0, 0)] + [InlineData(0.5f, 0, 1, 0.5f)] + [InlineData(-0.5f, -0.1f, 10, -0.1f)] + [InlineData(-0.05f, -0.1f, 10, -0.05f)] + [InlineData(9.9f, -0.1f, 10, 9.9f)] + [InlineData(10f, -0.1f, 10, 10f)] + [InlineData(10.1f, -0.1f, 10, 10f)] + public void Clamp(float x, float min, float max, float expected) { - int actual = ImageMaths.ModuloP2(a, m); + float actual = ImageMaths.Clamp(x, min, max); Assert.Equal(expected, actual); } From 520c6fc564c7748f73e9e7e64c483f48f0e2490f Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sun, 21 Oct 2018 21:09:40 +0200 Subject: [PATCH 19/22] fix comment --- .../Common/Helpers/SimdUtils.FallbackIntrinsics128.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs index 2d9f53eaf..ab18a0067 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs @@ -132,7 +132,7 @@ namespace SixLabors.ImageSharp s *= maxBytes; s += half; - // I'm not sure if Clamp() is properly implemented with intrinsics. + // I'm not sure if Vector4.Clamp() is properly implemented with intrinsics. s = Vector4.Max(Vector4.Zero, s); s = Vector4.Min(maxBytes, s); From 5c687fa004e32ff8114c58e309070fc2f4ea2ca5 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sun, 21 Oct 2018 23:38:12 +0200 Subject: [PATCH 20/22] address review findings + some more cleanup --- .../Helpers/SimdUtils.BasicIntrinsics256.cs | 28 ++++++++----------- .../Helpers/SimdUtils.ExtendedIntrinsics.cs | 14 +++------- .../SimdUtils.FallbackIntrinsics128.cs | 14 +++------- src/ImageSharp/Common/Helpers/SimdUtils.cs | 22 ++++++++++++++- src/ImageSharp/Common/Tuples/Octet.cs | 13 +++++++-- src/ImageSharp/Common/Tuples/Vector4Pair.cs | 10 +++---- .../JpegColorConverter.FromYCbCrSimd.cs | 2 +- .../JpegColorConverter.FromYCbCrSimdAvx2.cs | 2 +- .../ColorConverters/JpegColorConverter.cs | 4 +-- .../PixelFormats/PixelOperations{TPixel}.cs | 10 +++---- 10 files changed, 64 insertions(+), 55 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs index 713d606e7..0f1ce2ab6 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs @@ -28,7 +28,7 @@ namespace SixLabors.ImageSharp ref ReadOnlySpan source, ref Span dest) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); if (!IsAvailable) { @@ -57,7 +57,7 @@ namespace SixLabors.ImageSharp ref ReadOnlySpan source, ref Span dest) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); if (!IsAvailable) { @@ -78,16 +78,15 @@ namespace SixLabors.ImageSharp /// /// SIMD optimized implementation for . - /// Works only with `dest.Length` divisible by 8. + /// Works only with span Length divisible by 8. /// Implementation adapted from: /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions /// http://stackoverflow.com/a/536278 /// internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - GuardAvx2(nameof(BulkConvertByteToNormalizedFloat)); - - DebugGuard.IsTrue(ImageMaths.Modulo8(dest.Length) == 0, nameof(source), "dest.Length should be divisable by 8!"); + VerifyIsAvx2Compatible(nameof(BulkConvertByteToNormalizedFloat)); + VerifySpanInput(source, dest, 8); var bVec = new Vector(256.0f / 255.0f); var magicFloat = new Vector(32768.0f); @@ -128,9 +127,8 @@ namespace SixLabors.ImageSharp /// internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) { - GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows)); - - DebugGuard.IsTrue(ImageMaths.Modulo8(source.Length) == 0, nameof(source), "source.Length should be divisible by 8!"); + VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByteClampOverflows)); + VerifySpanInput(source, dest, 8); if (source.Length == 0) { @@ -168,9 +166,9 @@ namespace SixLabors.ImageSharp } /// - /// Convert 'source.Length' values normalized into [0..1] from 'source' + /// Convert all values normalized into [0..1] from 'source' /// into 'dest' buffer of . The values are scaled up into [0-255] and rounded. - /// The implementation is SIMD optimized and works only with `source.Length` divisible by 8. + /// This implementation is SIMD optimized and works only when span Length is divisible by 8. /// Based on: /// /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions @@ -178,12 +176,8 @@ namespace SixLabors.ImageSharp /// internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan source, Span dest) { - GuardAvx2(nameof(BulkConvertNormalizedFloatToByte)); - - DebugGuard.IsTrue( - ImageMaths.Modulo8(source.Length) == 0, - nameof(source), - "source.Length should be divisible by 8!"); + VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByte)); + VerifySpanInput(source, dest, 8); if (source.Length == 0) { diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index dfa6f189c..e0d6187dc 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -35,7 +35,7 @@ namespace SixLabors.ImageSharp ref ReadOnlySpan source, ref Span dest) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); if (!IsAvailable) { @@ -62,7 +62,7 @@ namespace SixLabors.ImageSharp ref ReadOnlySpan source, ref Span dest) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); if (!IsAvailable) { @@ -88,10 +88,7 @@ namespace SixLabors.ImageSharp /// internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - DebugGuard.IsTrue( - ImageMaths.ModuloP2(dest.Length, Vector.Count) == 0, - nameof(source), - "dest.Length should be divisible by Vector.Count!"); + VerifySpanInput(source, dest, Vector.Count); int n = dest.Length / Vector.Count; @@ -126,10 +123,7 @@ namespace SixLabors.ImageSharp ReadOnlySpan source, Span dest) { - DebugGuard.IsTrue( - ImageMaths.ModuloP2(dest.Length, Vector.Count) == 0, - nameof(dest), - "dest.Length should be divisible by Vector.Count!"); + VerifySpanInput(source, dest, Vector.Count); int n = dest.Length / Vector.Count; diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs index ab18a0067..565ea08f5 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs @@ -26,7 +26,7 @@ namespace SixLabors.ImageSharp ref ReadOnlySpan source, ref Span dest) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); int remainder = ImageMaths.Modulo4(source.Length); int adjustedCount = source.Length - remainder; @@ -50,7 +50,7 @@ namespace SixLabors.ImageSharp ref ReadOnlySpan source, ref Span dest) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); int remainder = ImageMaths.Modulo4(source.Length); int adjustedCount = source.Length - remainder; @@ -72,10 +72,7 @@ namespace SixLabors.ImageSharp [MethodImpl(InliningOptions.ColdPath)] internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - DebugGuard.IsTrue( - ImageMaths.Modulo4(dest.Length) == 0, - nameof(dest), - "dest.Length should be divisible by 4!"); + VerifySpanInput(source, dest, 4); int count = dest.Length / 4; if (count == 0) @@ -109,10 +106,7 @@ namespace SixLabors.ImageSharp ReadOnlySpan source, Span dest) { - DebugGuard.IsTrue( - ImageMaths.Modulo4(source.Length) == 0, - nameof(source), - "source.Length should be divisible by 4!"); + VerifySpanInput(source, dest, 4); int count = source.Length / 4; if (count == 0) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 95a6030fd..fade8da79 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -154,12 +154,32 @@ namespace SixLabors.ImageSharp private static byte ConvertToByte(float f) => (byte)ImageMaths.Clamp((f * 255f) + 0.5f, 0, 255f); [Conditional("DEBUG")] - private static void GuardAvx2(string operation) + private static void VerifyIsAvx2Compatible(string operation) { if (!IsAvx2CompatibleArchitecture) { throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!"); } } + + [Conditional("DEBUG")] + private static void VerifySpanInput(ReadOnlySpan source, Span dest, int shouldBeDivisibleBy) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); + DebugGuard.IsTrue( + ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy) == 0, + nameof(source), + $"length should be divisable by {shouldBeDivisibleBy}!"); + } + + [Conditional("DEBUG")] + private static void VerifySpanInput(ReadOnlySpan source, Span dest, int shouldBeDivisibleBy) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); + DebugGuard.IsTrue( + ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy) == 0, + nameof(source), + $"length should be divisable by {shouldBeDivisibleBy}!"); + } } } \ No newline at end of file diff --git a/src/ImageSharp/Common/Tuples/Octet.cs b/src/ImageSharp/Common/Tuples/Octet.cs index ae01a3121..539b74e32 100644 --- a/src/ImageSharp/Common/Tuples/Octet.cs +++ b/src/ImageSharp/Common/Tuples/Octet.cs @@ -3,8 +3,14 @@ using System.Runtime.InteropServices; namespace SixLabors.ImageSharp.Tuples { + /// + /// Contains 8 element value tuples of various types. + /// internal static class Octet { + /// + /// Value tuple of -s + /// [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))] public struct OfUInt32 { @@ -34,7 +40,7 @@ namespace SixLabors.ImageSharp.Tuples public override string ToString() { - return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; + return $"{nameof(Octet)}.{nameof(OfUInt32)}({this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7})"; } [MethodImpl(InliningOptions.ShortMethod)] @@ -51,6 +57,9 @@ namespace SixLabors.ImageSharp.Tuples } } + /// + /// Value tuple of -s + /// [StructLayout(LayoutKind.Explicit, Size = 8)] public struct OfByte { @@ -80,7 +89,7 @@ namespace SixLabors.ImageSharp.Tuples public override string ToString() { - return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]"; + return $"{nameof(Octet)}.{nameof(OfByte)}({this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7})"; } [MethodImpl(InliningOptions.ShortMethod)] diff --git a/src/ImageSharp/Common/Tuples/Vector4Pair.cs b/src/ImageSharp/Common/Tuples/Vector4Pair.cs index 5988b2200..cae283d62 100644 --- a/src/ImageSharp/Common/Tuples/Vector4Pair.cs +++ b/src/ImageSharp/Common/Tuples/Vector4Pair.cs @@ -7,6 +7,7 @@ namespace SixLabors.ImageSharp.Tuples /// /// Its faster to process multiple Vector4-s together, so let's pair them! /// On AVX2 this pair should be convertible to of ! + /// TODO: Investigate defining this as union with an Octet.OfSingle type. /// [StructLayout(LayoutKind.Sequential)] internal struct Vector4Pair @@ -15,8 +16,6 @@ namespace SixLabors.ImageSharp.Tuples public Vector4 B; - private static readonly Vector4 Scale = new Vector4(1 / 255f); - [MethodImpl(MethodImplOptions.AggressiveInlining)] public void MultiplyInplace(float value) { @@ -52,8 +51,9 @@ namespace SixLabors.ImageSharp.Tuples b = b.FastRound(); // Downscale by 1/255 - this.A *= Scale; - this.B *= Scale; + var scale = new Vector4(1 / 255f); + this.A *= scale; + this.B *= scale; } /// @@ -74,7 +74,7 @@ namespace SixLabors.ImageSharp.Tuples public override string ToString() { - return $"{this.A}, {this.B}"; + return $"{nameof(Vector4Pair)}({this.A}, {this.B})"; } } } \ No newline at end of file diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs index 5c63a478d..1dc72aaf5 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs @@ -109,7 +109,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order: ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); - destination.Collect(ref r, ref g, ref b); + destination.Pack(ref r, ref g, ref b); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index 3f26cdc90..46644258b 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -102,7 +102,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order: ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i); - destination.Collect(ref rr, ref gg, ref bb); + destination.Pack(ref rr, ref gg, ref bb); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index 293f3bc1f..456636dc3 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -157,9 +157,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters public Vector4 V0, V1, V2, V3, V4, V5, V6, V7; /// - /// Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order. + /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ... /// - public void Collect(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) + public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) { this.V0.X = r.A.X; this.V0.Y = g.A.X; diff --git a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs index cbf164a71..6c133191a 100644 --- a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs +++ b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs @@ -30,11 +30,10 @@ namespace SixLabors.ImageSharp.PixelFormats internal virtual void PackFromVector4(ReadOnlySpan sourceVectors, Span destinationColors, int count) { ReadOnlySpan sourceVectors1 = sourceVectors; - Span destinationColors1 = destinationColors; - GuardSpans(sourceVectors1, nameof(sourceVectors1), destinationColors1, nameof(destinationColors1), count); + GuardSpans(sourceVectors1, nameof(sourceVectors1), destinationColors, nameof(destinationColors), count); ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors1); - ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors1); + ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors); for (int i = 0; i < count; i++) { @@ -53,11 +52,10 @@ namespace SixLabors.ImageSharp.PixelFormats internal virtual void ToVector4(ReadOnlySpan sourceColors, Span destinationVectors, int count) { ReadOnlySpan sourceColors1 = sourceColors; - Span destinationVectors1 = destinationVectors; - GuardSpans(sourceColors1, nameof(sourceColors1), destinationVectors1, nameof(destinationVectors1), count); + GuardSpans(sourceColors1, nameof(sourceColors1), destinationVectors, nameof(destinationVectors), count); ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors1); - ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors1); + ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors); for (int i = 0; i < count; i++) { From 54ccf05794fb7a80c8a03572ab0f7ae5560d4714 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Mon, 22 Oct 2018 00:03:12 +0200 Subject: [PATCH 21/22] drop slow Clamp() implementation --- src/ImageSharp/Common/Helpers/ImageMaths.cs | 3 - src/ImageSharp/Common/Helpers/SimdUtils.cs | 6 +- .../General/BasicMath/ClampFloat.cs | 70 +++++++++++++++++++ .../{Clamp.cs => ClampInt32IntoByte.cs} | 2 +- .../Helpers/ImageMathsTests.cs | 2 +- 5 files changed, 75 insertions(+), 8 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/General/BasicMath/ClampFloat.cs rename tests/ImageSharp.Benchmarks/General/BasicMath/{Clamp.cs => ClampInt32IntoByte.cs} (98%) diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs index 1395975ec..02a2e9ee5 100644 --- a/src/ImageSharp/Common/Helpers/ImageMaths.cs +++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs @@ -61,9 +61,6 @@ namespace SixLabors.ImageSharp return x & (m - 1); } - [MethodImpl(InliningOptions.ShortMethod)] - public static float Clamp(float x, float min, float max) => Math.Min(max, Math.Max(min, x)); - /// /// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation. /// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index fade8da79..737e62006 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -65,7 +65,7 @@ namespace SixLabors.ImageSharp [MethodImpl(InliningOptions.ShortMethod)] internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); #if NETCOREAPP2_1 ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest); @@ -92,7 +92,7 @@ namespace SixLabors.ImageSharp [MethodImpl(InliningOptions.ShortMethod)] internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!"); + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); #if NETCOREAPP2_1 ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); @@ -151,7 +151,7 @@ namespace SixLabors.ImageSharp } [MethodImpl(InliningOptions.ShortMethod)] - private static byte ConvertToByte(float f) => (byte)ImageMaths.Clamp((f * 255f) + 0.5f, 0, 255f); + private static byte ConvertToByte(float f) => (byte)ComparableExtensions.Clamp((f * 255f) + 0.5f, 0, 255f); [Conditional("DEBUG")] private static void VerifyIsAvx2Compatible(string operation) diff --git a/tests/ImageSharp.Benchmarks/General/BasicMath/ClampFloat.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/ClampFloat.cs new file mode 100644 index 000000000..3b7dea095 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/General/BasicMath/ClampFloat.cs @@ -0,0 +1,70 @@ +using System; +using System.Runtime.CompilerServices; + +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Running; + +namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath +{ + public class ClampFloat + { + private readonly float min = -1.5f; + private readonly float max = 2.5f; + private static readonly float[] Values = { -10, -5, -3, -1.5f, -0.5f, 0f, 1f, 1.5f, 2.5f, 3, 10 }; + + [Benchmark(Baseline = true)] + public float UsingMathF() + { + float acc = 0; + + for (int i = 0; i < Values.Length; i++) + { + acc += ClampUsingMathF(Values[i], this.min, this.max); + } + + return acc; + } + + [Benchmark] + public float UsingBranching() + { + float acc = 0; + + for (int i = 0; i < Values.Length; i++) + { + acc += ClampUsingBranching(Values[i], this.min, this.max); + } + + return acc; + } + + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static float ClampUsingMathF(float x, float min, float max) + { + return Math.Min(max, Math.Max(min, x)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static float ClampUsingBranching(float x, float min, float max) + { + if (x >= max) + { + return max; + } + + if (x <= min) + { + return min; + } + + return x; + } + + // RESULTS: + // Method | Mean | Error | StdDev | Scaled | + // --------------- |---------:|----------:|----------:|-------:| + // UsingMathF | 30.37 ns | 0.3764 ns | 0.3337 ns | 1.00 | + // UsingBranching | 18.66 ns | 0.1043 ns | 0.0871 ns | 0.61 | + } +} \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/General/BasicMath/Clamp.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/ClampInt32IntoByte.cs similarity index 98% rename from tests/ImageSharp.Benchmarks/General/BasicMath/Clamp.cs rename to tests/ImageSharp.Benchmarks/General/BasicMath/ClampInt32IntoByte.cs index d486cb2f3..6ce82ba11 100644 --- a/tests/ImageSharp.Benchmarks/General/BasicMath/Clamp.cs +++ b/tests/ImageSharp.Benchmarks/General/BasicMath/ClampInt32IntoByte.cs @@ -10,7 +10,7 @@ using BenchmarkDotNet.Attributes; namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath { - public class Clamp + public class ClampInt32IntoByte { [Params(-1, 0, 255, 256)] public int Value { get; set; } diff --git a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs index d8b1525be..75ef611a5 100644 --- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs +++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs @@ -73,7 +73,7 @@ namespace SixLabors.ImageSharp.Tests.Helpers [InlineData(10.1f, -0.1f, 10, 10f)] public void Clamp(float x, float min, float max, float expected) { - float actual = ImageMaths.Clamp(x, min, max); + float actual = x.Clamp(min, max); Assert.Equal(expected, actual); } From 90c7153a6ebd8e4c8d0c24a0837d1f7aa7c340c2 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Mon, 22 Oct 2018 00:05:47 +0200 Subject: [PATCH 22/22] remove useless reassignment in PixelOperations{TPixel} --- src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs index 6c133191a..b12a2bfa5 100644 --- a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs +++ b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs @@ -29,10 +29,9 @@ namespace SixLabors.ImageSharp.PixelFormats /// The number of pixels to convert. internal virtual void PackFromVector4(ReadOnlySpan sourceVectors, Span destinationColors, int count) { - ReadOnlySpan sourceVectors1 = sourceVectors; - GuardSpans(sourceVectors1, nameof(sourceVectors1), destinationColors, nameof(destinationColors), count); + GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); - ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors1); + ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors); ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors); for (int i = 0; i < count; i++) @@ -51,10 +50,9 @@ namespace SixLabors.ImageSharp.PixelFormats /// The number of pixels to convert. internal virtual void ToVector4(ReadOnlySpan sourceColors, Span destinationVectors, int count) { - ReadOnlySpan sourceColors1 = sourceColors; - GuardSpans(sourceColors1, nameof(sourceColors1), destinationVectors, nameof(destinationVectors), count); + GuardSpans(sourceColors, nameof(sourceColors), destinationVectors, nameof(destinationVectors), count); - ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors1); + ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors); ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors); for (int i = 0; i < count; i++)