From 8efdbfb9de16d386aa5827bee1ebc0f6113408be Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Sun, 1 Mar 2020 00:15:36 +0100 Subject: [PATCH] AVX float -> byte conversion --- .../Helpers/SimdUtils.Avx2Intrinsics.cs | 99 ++++ src/ImageSharp/Common/Helpers/SimdUtils.cs | 6 +- .../JpegColorConverter.FromYCbCrSimdAvx2.cs | 6 +- .../ColorConverters/JpegColorConverter.cs | 2 +- .../Jpeg/BlockOperations/Block8x8F_Round.cs | 442 +++++++++++++++++- .../Codecs/Jpeg/YCbCrColorConversion.cs | 10 +- .../Color/Bulk/FromVector4.cs | 122 +++-- tests/ImageSharp.Benchmarks/Config.cs | 8 + .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 19 + .../Formats/Jpg/JpegColorConverterTests.cs | 2 +- 10 files changed, 668 insertions(+), 48 deletions(-) create mode 100644 src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs new file mode 100644 index 000000000..85f73776c --- /dev/null +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs @@ -0,0 +1,99 @@ +// Copyright (c) Six Labors and contributors. +// Licensed under the Apache License, Version 2.0. + +#if SUPPORTS_RUNTIME_INTRINSICS + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + public static class Avx2Intrinsics + { + private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + + /// + /// as many elements as possible, slicing them down (keeping the remainder). + /// + [MethodImpl(InliningOptions.ShortMethod)] + internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce( + ref ReadOnlySpan source, + ref Span dest) + { + DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); + + if (Avx2.IsSupported) + { + int remainder = ImageMaths.ModuloP2(source.Length, Vector.Count); + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + BulkConvertNormalizedFloatToByteClampOverflows( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount)); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); + } + } + } + + /// + /// Implementation of , which is faster on new .NET runtime. + /// + internal static void BulkConvertNormalizedFloatToByteClampOverflows( + ReadOnlySpan source, + Span dest) + { + VerifySpanInput(source, dest, Vector256.Count); + + int n = dest.Length / Vector256.Count; + + ref Vector256 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var maxBytes = Vector256.Create(255f); + ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32); + Vector256 mask = Unsafe.As>(ref maskBase); + + for (int i = 0; i < n; i++) + { + ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4); + + Vector256 f0 = s; + Vector256 f1 = Unsafe.Add(ref s, 1); + Vector256 f2 = Unsafe.Add(ref s, 2); + Vector256 f3 = Unsafe.Add(ref s, 3); + + Vector256 w0 = ConvertToInt32(f0, maxBytes); + Vector256 w1 = ConvertToInt32(f1, maxBytes); + Vector256 w2 = ConvertToInt32(f2, maxBytes); + Vector256 w3 = ConvertToInt32(f3, maxBytes); + + Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); + Vector256 u1 = Avx2.PackSignedSaturate(w2, w3); + Vector256 b = Avx2.PackUnsignedSaturate(u0, u1); + b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte(); + + Unsafe.Add(ref destBase, i) = b; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 ConvertToInt32(Vector256 vf, Vector256 scale) + { + vf = Avx.Multiply(vf, scale); + return Avx.ConvertToVector256Int32(vf); + } + } + } +} +#endif diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 08c256f84..de313c8d5 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -92,11 +92,15 @@ namespace SixLabors.ImageSharp { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); -#if SUPPORTS_EXTENDED_INTRINSICS +#if SUPPORTS_RUNTIME_INTRINSICS + Avx2Intrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); +#elif SUPPORTS_EXTENDED_INTRINSICS ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); #else BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); #endif + + // Also deals with the remainder from previous conversions: FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest); // Deal with the remainder: diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs index 1165db280..8c1b427ee 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs @@ -13,9 +13,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters { internal abstract partial class JpegColorConverter { - internal sealed class FromYCbCrSimdAvx2 : JpegColorConverter + internal sealed class FromYCbCrSimdVector8 : JpegColorConverter { - public FromYCbCrSimdAvx2(int precision) + public FromYCbCrSimdVector8(int precision) : base(JpegColorSpace.YCbCr, precision) { } @@ -107,4 +107,4 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters } } } -} \ No newline at end of file +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs index 44314759c..7ada1b9da 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs @@ -93,7 +93,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters /// Returns the for the YCbCr colorspace that matches the current CPU architecture. /// private static JpegColorConverter GetYCbCrConverter(int precision) => - FromYCbCrSimdAvx2.IsAvailable ? (JpegColorConverter)new FromYCbCrSimdAvx2(precision) : new FromYCbCrSimd(precision); + FromYCbCrSimdVector8.IsAvailable ? (JpegColorConverter)new FromYCbCrSimdVector8(precision) : new FromYCbCrSimd(precision); /// /// A stack-only struct to reference the input buffers using -s. diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Round.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Round.cs index bf6ea3dac..32d838f8c 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Round.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Round.cs @@ -4,6 +4,12 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using BenchmarkDotNet.Attributes; @@ -12,10 +18,14 @@ using SixLabors.ImageSharp.Formats.Jpeg.Components; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations { - public class Block8x8F_Round + public unsafe class Block8x8F_Round { private Block8x8F block; + private readonly byte[] blockBuffer = new byte[512]; + private GCHandle blockHandle; + private float* alignedPtr; + [GlobalSetup] public void Setup() { @@ -24,13 +34,27 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations throw new NotSupportedException("Vector.Count != 8"); } - for (int i = 0; i < Block8x8F.Size; i++) + this.blockHandle = GCHandle.Alloc(this.blockBuffer, GCHandleType.Pinned); + ulong ptr = (ulong)this.blockHandle.AddrOfPinnedObject(); + ptr += 16; + ptr -= ptr % 16; + + if (ptr % 16 != 0) { - this.block[i] = i * 44.8f; + throw new Exception("ptr is unaligned"); } + + this.alignedPtr = (float*)ptr; } - [Benchmark(Baseline = true)] + [GlobalCleanup] + public void Cleanup() + { + this.blockHandle.Free(); + this.alignedPtr = null; + } + + [Benchmark] public void ScalarRound() { ref float b = ref Unsafe.As(ref this.block); @@ -42,8 +66,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations } } - [Benchmark] - public void SimdRound() + [Benchmark(Baseline = true)] + public void SimdUtils_FastRound_Vector8() { ref Block8x8F b = ref this.block; @@ -64,5 +88,411 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations ref Vector row7 = ref Unsafe.As>(ref b.V7L); row7 = SimdUtils.FastRound(row7); } + + [Benchmark] + public void SimdUtils_FastRound_Vector8_ForceAligned() + { + ref Block8x8F b = ref Unsafe.AsRef(this.alignedPtr); + + ref Vector row0 = ref Unsafe.As>(ref b.V0L); + row0 = SimdUtils.FastRound(row0); + ref Vector row1 = ref Unsafe.As>(ref b.V1L); + row1 = SimdUtils.FastRound(row1); + ref Vector row2 = ref Unsafe.As>(ref b.V2L); + row2 = SimdUtils.FastRound(row2); + ref Vector row3 = ref Unsafe.As>(ref b.V3L); + row3 = SimdUtils.FastRound(row3); + ref Vector row4 = ref Unsafe.As>(ref b.V4L); + row4 = SimdUtils.FastRound(row4); + ref Vector row5 = ref Unsafe.As>(ref b.V5L); + row5 = SimdUtils.FastRound(row5); + ref Vector row6 = ref Unsafe.As>(ref b.V6L); + row6 = SimdUtils.FastRound(row6); + ref Vector row7 = ref Unsafe.As>(ref b.V7L); + row7 = SimdUtils.FastRound(row7); + } + + [Benchmark] + public void SimdUtils_FastRound_Vector8_Grouped() + { + ref Block8x8F b = ref this.block; + + ref Vector row0 = ref Unsafe.As>(ref b.V0L); + ref Vector row1 = ref Unsafe.As>(ref b.V1L); + ref Vector row2 = ref Unsafe.As>(ref b.V2L); + ref Vector row3 = ref Unsafe.As>(ref b.V3L); + + row0 = SimdUtils.FastRound(row0); + row1 = SimdUtils.FastRound(row1); + row2 = SimdUtils.FastRound(row2); + row3 = SimdUtils.FastRound(row3); + + row0 = ref Unsafe.As>(ref b.V4L); + row1 = ref Unsafe.As>(ref b.V5L); + row2 = ref Unsafe.As>(ref b.V6L); + row3 = ref Unsafe.As>(ref b.V7L); + + row0 = SimdUtils.FastRound(row0); + row1 = SimdUtils.FastRound(row1); + row2 = SimdUtils.FastRound(row2); + row3 = SimdUtils.FastRound(row3); + } + +#if SUPPORTS_RUNTIME_INTRINSICS + [Benchmark] + public void Sse41_V1() + { + ref Vector128 b0 = ref Unsafe.As>(ref this.block); + + ref Vector128 p = ref b0; + p = Sse41.RoundToNearestInteger(p); + + p = ref Unsafe.Add(ref b0, 1); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 2); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 3); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 4); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 5); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 6); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 7); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 8); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 9); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 10); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 11); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 12); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 13); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 14); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.Add(ref b0, 15); + p = Sse41.RoundToNearestInteger(p); + } + + [Benchmark] + public unsafe void Sse41_V2() + { + ref Vector128 p = ref Unsafe.As>(ref this.block); + p = Sse41.RoundToNearestInteger(p); + var offset = (IntPtr)sizeof(Vector128); + p = Sse41.RoundToNearestInteger(p); + + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + } + + [Benchmark] + public unsafe void Sse41_V3() + { + ref Vector128 p = ref Unsafe.As>(ref this.block); + p = Sse41.RoundToNearestInteger(p); + var offset = (IntPtr)sizeof(Vector128); + + for (int i = 0; i < 15; i++) + { + p = ref Unsafe.AddByteOffset(ref p, offset); + p = Sse41.RoundToNearestInteger(p); + } + } + + [Benchmark] + public unsafe void Sse41_V4() + { + ref Vector128 p = ref Unsafe.As>(ref this.block); + var offset = (IntPtr)sizeof(Vector128); + + ref Vector128 a = ref p; + ref Vector128 b = ref Unsafe.AddByteOffset(ref a, offset); + ref Vector128 c = ref Unsafe.AddByteOffset(ref b, offset); + ref Vector128 d = ref Unsafe.AddByteOffset(ref c, offset); + a = Sse41.RoundToNearestInteger(a); + b = Sse41.RoundToNearestInteger(b); + c = Sse41.RoundToNearestInteger(c); + d = Sse41.RoundToNearestInteger(d); + + a = ref Unsafe.AddByteOffset(ref d, offset); + b = ref Unsafe.AddByteOffset(ref a, offset); + c = ref Unsafe.AddByteOffset(ref b, offset); + d = ref Unsafe.AddByteOffset(ref c, offset); + a = Sse41.RoundToNearestInteger(a); + b = Sse41.RoundToNearestInteger(b); + c = Sse41.RoundToNearestInteger(c); + d = Sse41.RoundToNearestInteger(d); + + a = ref Unsafe.AddByteOffset(ref d, offset); + b = ref Unsafe.AddByteOffset(ref a, offset); + c = ref Unsafe.AddByteOffset(ref b, offset); + d = ref Unsafe.AddByteOffset(ref c, offset); + a = Sse41.RoundToNearestInteger(a); + b = Sse41.RoundToNearestInteger(b); + c = Sse41.RoundToNearestInteger(c); + d = Sse41.RoundToNearestInteger(d); + + a = ref Unsafe.AddByteOffset(ref d, offset); + b = ref Unsafe.AddByteOffset(ref a, offset); + c = ref Unsafe.AddByteOffset(ref b, offset); + d = ref Unsafe.AddByteOffset(ref c, offset); + a = Sse41.RoundToNearestInteger(a); + b = Sse41.RoundToNearestInteger(b); + c = Sse41.RoundToNearestInteger(c); + d = Sse41.RoundToNearestInteger(d); + } + + [Benchmark] + public unsafe void Sse41_V5_Unaligned() + { + float* p = this.alignedPtr + 1; + + Vector128 v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + p += 8; + + v = Sse.LoadVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.Store(p, v); + } + + [Benchmark] + public unsafe void Sse41_V5_Aligned() + { + float* p = this.alignedPtr; + + Vector128 v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + + v = Sse.LoadAlignedVector128(p); + v = Sse41.RoundToNearestInteger(v); + Sse.StoreAligned(p, v); + p += 8; + } + + [Benchmark] + public void Sse41_V6_Aligned() + { + float* p = this.alignedPtr; + + Round8SseVectors(p); + Round8SseVectors(p + 32); + } + + private static void Round8SseVectors(float* p0) + { + float* p1 = p0 + 4; + float* p2 = p1 + 4; + float* p3 = p2 + 4; + float* p4 = p3 + 4; + float* p5 = p4 + 4; + float* p6 = p5 + 4; + float* p7 = p6 + 4; + + Vector128 v0 = Sse.LoadAlignedVector128(p0); + Vector128 v1 = Sse.LoadAlignedVector128(p1); + Vector128 v2 = Sse.LoadAlignedVector128(p2); + Vector128 v3 = Sse.LoadAlignedVector128(p3); + Vector128 v4 = Sse.LoadAlignedVector128(p4); + Vector128 v5 = Sse.LoadAlignedVector128(p5); + Vector128 v6 = Sse.LoadAlignedVector128(p6); + Vector128 v7 = Sse.LoadAlignedVector128(p7); + + v0 = Sse41.RoundToNearestInteger(v0); + v1 = Sse41.RoundToNearestInteger(v1); + v2 = Sse41.RoundToNearestInteger(v2); + v3 = Sse41.RoundToNearestInteger(v3); + v4 = Sse41.RoundToNearestInteger(v4); + v5 = Sse41.RoundToNearestInteger(v5); + v6 = Sse41.RoundToNearestInteger(v6); + v7 = Sse41.RoundToNearestInteger(v7); + + Sse.StoreAligned(p0, v0); + Sse.StoreAligned(p1, v1); + Sse.StoreAligned(p2, v2); + Sse.StoreAligned(p3, v3); + Sse.StoreAligned(p4, v4); + Sse.StoreAligned(p5, v5); + Sse.StoreAligned(p6, v6); + Sse.StoreAligned(p7, v7); + } +#endif } } diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/YCbCrColorConversion.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/YCbCrColorConversion.cs index 1e4ebb719..f8e7f7fb8 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/YCbCrColorConversion.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/YCbCrColorConversion.cs @@ -11,7 +11,7 @@ using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg { - [Config(typeof(Config.ShortClr))] + [Config(typeof(Config.ShortCore31))] public class YCbCrColorConversion { private Buffer2D[] input; @@ -36,7 +36,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg } } - [Benchmark(Baseline = true)] + [Benchmark] public void Scalar() { var values = new JpegColorConverter.ComponentValues(this.input, 0); @@ -44,7 +44,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg JpegColorConverter.FromYCbCrBasic.ConvertCore(values, this.output, 255F, 128F); } - [Benchmark] + [Benchmark(Baseline = true)] public void SimdVector4() { var values = new JpegColorConverter.ComponentValues(this.input, 0); @@ -53,11 +53,11 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg } [Benchmark] - public void SimdAvx2() + public void SimdVector8() { var values = new JpegColorConverter.ComponentValues(this.input, 0); - JpegColorConverter.FromYCbCrSimdAvx2.ConvertCore(values, this.output, 255F, 128F); + JpegColorConverter.FromYCbCrSimdVector8.ConvertCore(values, this.output, 255F, 128F); } private static Buffer2D[] CreateRandomValues( diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs index f2af362e0..04043c2f7 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs @@ -7,15 +7,21 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using BenchmarkDotNet.Attributes; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Environments; +using BenchmarkDotNet.Jobs; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk { - [Config(typeof(Config.ShortClr))] + [Config(typeof(Config.ShortCore31))] public abstract class FromVector4 where TPixel : unmanaged, IPixel { @@ -25,7 +31,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk protected Configuration Configuration => Configuration.Default; - [Params(64, 2048)] + // [Params(64, 2048)] + [Params(1024)] public int Count { get; set; } [GlobalSetup] @@ -77,7 +84,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } - [Benchmark(Baseline = true)] + [Benchmark] public void BasicIntrinsics256() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -86,7 +93,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } - [Benchmark] + [Benchmark(Baseline = true)] public void ExtendedIntrinsic() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -95,31 +102,84 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } - // RESULTS (2018 October): - // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated | - // ---------------------------- |-------- |------ |-------------:|-------------:|------------:|-------:|---------:|-------:|----------:| - // FallbackIntrinsics128 | Clr | 64 | 340.38 ns | 22.319 ns | 1.2611 ns | 1.41 | 0.01 | - | 0 B | - // BasicIntrinsics256 | Clr | 64 | 240.79 ns | 11.421 ns | 0.6453 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsic | Clr | 64 | 199.09 ns | 124.239 ns | 7.0198 ns | 0.83 | 0.02 | - | 0 B | - // PixelOperations_Base | Clr | 64 | 647.99 ns | 24.003 ns | 1.3562 ns | 2.69 | 0.01 | 0.0067 | 24 B | - // PixelOperations_Specialized | Clr | 64 | 259.79 ns | 13.391 ns | 0.7566 ns | 1.08 | 0.00 | - | 0 B | <--- ceremonial overhead has been minimized! - // | | | | | | | | | | - // FallbackIntrinsics128 | Core | 64 | 234.64 ns | 12.320 ns | 0.6961 ns | 1.58 | 0.00 | - | 0 B | - // BasicIntrinsics256 | Core | 64 | 148.87 ns | 2.794 ns | 0.1579 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsic | Core | 64 | 94.06 ns | 10.015 ns | 0.5659 ns | 0.63 | 0.00 | - | 0 B | - // PixelOperations_Base | Core | 64 | 573.52 ns | 31.865 ns | 1.8004 ns | 3.85 | 0.01 | 0.0067 | 24 B | - // PixelOperations_Specialized | Core | 64 | 117.21 ns | 13.264 ns | 0.7494 ns | 0.79 | 0.00 | - | 0 B | - // | | | | | | | | | | - // FallbackIntrinsics128 | Clr | 2048 | 6,735.93 ns | 2,139.340 ns | 120.8767 ns | 1.71 | 0.03 | - | 0 B | - // BasicIntrinsics256 | Clr | 2048 | 3,929.29 ns | 334.027 ns | 18.8731 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsic | Clr | 2048 | 2,226.01 ns | 130.525 ns | 7.3749 ns |!! 0.57 | 0.00 | - | 0 B | <--- ExtendedIntrinsics rock! - // PixelOperations_Base | Clr | 2048 | 16,760.84 ns | 367.800 ns | 20.7814 ns | 4.27 | 0.02 | - | 24 B | <--- Extra copies using "Vector4 TPixel.ToVector4()" - // PixelOperations_Specialized | Clr | 2048 | 3,986.03 ns | 237.238 ns | 13.4044 ns | 1.01 | 0.00 | - | 0 B | <--- can't yet detect whether ExtendedIntrinsics are available :( - // | | | | | | | | | | - // FallbackIntrinsics128 | Core | 2048 | 6,644.65 ns | 2,677.090 ns | 151.2605 ns | 1.69 | 0.05 | - | 0 B | - // BasicIntrinsics256 | Core | 2048 | 3,923.70 ns | 1,971.760 ns | 111.4081 ns | 1.00 | 0.00 | - | 0 B | - // ExtendedIntrinsic | Core | 2048 | 2,092.32 ns | 375.657 ns | 21.2253 ns |!! 0.53 | 0.01 | - | 0 B | <--- ExtendedIntrinsics rock! - // PixelOperations_Base | Core | 2048 | 16,875.73 ns | 1,271.957 ns | 71.8679 ns | 4.30 | 0.10 | - | 24 B | - // PixelOperations_Specialized | Core | 2048 | 2,129.92 ns | 262.888 ns | 14.8537 ns |!! 0.54 | 0.01 | - | 0 B | <--- ExtendedIntrinsics rock! +#if SUPPORTS_RUNTIME_INTRINSICS + [Benchmark] + public void UseAvx2() + { + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + + SimdUtils.Avx2Intrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); + } + + private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; + + [Benchmark] + public void UseAvx2_Grouped() + { + Span src = MemoryMarshal.Cast(this.source.GetSpan()); + Span dest = MemoryMarshal.Cast(this.destination.GetSpan()); + + int n = dest.Length / Vector.Count; + + ref Vector256 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(src)); + ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32); + Vector256 mask = Unsafe.As>(ref maskBase); + + var maxBytes = Vector256.Create(255f); + + for (int i = 0; i < n; i++) + { + ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4); + + Vector256 f0 = s; + Vector256 f1 = Unsafe.Add(ref s, 1); + Vector256 f2 = Unsafe.Add(ref s, 2); + Vector256 f3 = Unsafe.Add(ref s, 3); + + f0 = Avx.Multiply(maxBytes, f0); + f1 = Avx.Multiply(maxBytes, f1); + f2 = Avx.Multiply(maxBytes, f2); + f3 = Avx.Multiply(maxBytes, f3); + + Vector256 w0 = Avx.ConvertToVector256Int32(f0); + Vector256 w1 = Avx.ConvertToVector256Int32(f1); + Vector256 w2 = Avx.ConvertToVector256Int32(f2); + Vector256 w3 = Avx.ConvertToVector256Int32(f3); + + Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); + Vector256 u1 = Avx2.PackSignedSaturate(w2, w3); + Vector256 b = Avx2.PackUnsignedSaturate(u0, u1); + b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte(); + + Unsafe.Add(ref destBase, i) = b; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 ConvertToInt32(Vector256 vf, Vector256 scale) + { + vf = Avx.Multiply(scale, vf); + return Avx.ConvertToVector256Int32(vf); + } +#endif + + // *** RESULTS 2020 March: *** + // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores + // .NET Core SDK=3.1.200-preview-014971 + // Job-IUZXZT : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT + // + // | Method | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |---------------------------- |------ |-----------:|------------:|----------:|------:|--------:|------:|------:|------:|----------:| + // | FallbackIntrinsics128 | 1024 | 2,952.6 ns | 1,680.77 ns | 92.13 ns | 3.32 | 0.16 | - | - | - | - | + // | BasicIntrinsics256 | 1024 | 1,664.5 ns | 928.11 ns | 50.87 ns | 1.87 | 0.09 | - | - | - | - | + // | ExtendedIntrinsic | 1024 | 890.6 ns | 375.48 ns | 20.58 ns | 1.00 | 0.00 | - | - | - | - | + // | UseAvx2 | 1024 | 299.0 ns | 30.47 ns | 1.67 ns | 0.34 | 0.01 | - | - | - | - | + // | UseAvx2_Grouped | 1024 | 318.1 ns | 48.19 ns | 2.64 ns | 0.36 | 0.01 | - | - | - | - | + // | PixelOperations_Base | 1024 | 8,136.9 ns | 1,834.82 ns | 100.57 ns | 9.14 | 0.26 | - | - | - | 24 B | + // | PixelOperations_Specialized | 1024 | 951.1 ns | 123.93 ns | 6.79 ns | 1.07 | 0.03 | - | - | - | - | } } diff --git a/tests/ImageSharp.Benchmarks/Config.cs b/tests/ImageSharp.Benchmarks/Config.cs index fd0b213b3..fda98a097 100644 --- a/tests/ImageSharp.Benchmarks/Config.cs +++ b/tests/ImageSharp.Benchmarks/Config.cs @@ -38,6 +38,14 @@ namespace SixLabors.ImageSharp.Benchmarks } } + public class ShortCore31 : Config + { + public ShortCore31() + { + this.Add(Job.Default.With(CoreRuntime.Core31).WithLaunchCount(1).WithWarmupCount(3).WithIterationCount(3)); + } + } + #if Windows_NT private bool IsElevated { diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index cc5519c79..1ab854c00 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -6,6 +6,7 @@ using System.Linq; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; using SixLabors.ImageSharp.Common.Tuples; using Xunit; @@ -277,6 +278,24 @@ namespace SixLabors.ImageSharp.Tests.Common Assert.Equal(expected2, actual2); } +#if SUPPORTS_RUNTIME_INTRINSICS + + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy32))] + public void Avx2_BulkConvertNormalizedFloatToByteClampOverflows(int count) + { + if (!System.Runtime.Intrinsics.X86.Avx2.IsSupported) + { + return; + } + + TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( + count, + (s, d) => SimdUtils.Avx2Intrinsics.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)); + } + +#endif + [Theory] [MemberData(nameof(ArbitraryArraySizes))] public void BulkConvertNormalizedFloatToByteClampOverflows(int count) diff --git a/tests/ImageSharp.Tests/Formats/Jpg/JpegColorConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/JpegColorConverterTests.cs index 7f0033cc5..27c612aee 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/JpegColorConverterTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/JpegColorConverterTests.cs @@ -107,7 +107,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg // JpegColorConverter.FromYCbCrSimdAvx2.LogPlz = s => this.Output.WriteLine(s); ValidateRgbToYCbCrConversion( - new JpegColorConverter.FromYCbCrSimdAvx2(8), + new JpegColorConverter.FromYCbCrSimdVector8(8), 3, inputBufferLength, resultBufferLength,