Browse Source

AVX float -> byte conversion

pull/1574/head
Anton Firszov 6 years ago
parent
commit
8efdbfb9de
  1. 99
      src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
  2. 6
      src/ImageSharp/Common/Helpers/SimdUtils.cs
  3. 6
      src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
  4. 2
      src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
  5. 442
      tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Round.cs
  6. 10
      tests/ImageSharp.Benchmarks/Codecs/Jpeg/YCbCrColorConversion.cs
  7. 122
      tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs
  8. 8
      tests/ImageSharp.Benchmarks/Config.cs
  9. 19
      tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
  10. 2
      tests/ImageSharp.Tests/Formats/Jpg/JpegColorConverterTests.cs

99
src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs

@ -0,0 +1,99 @@
// Copyright (c) Six Labors and contributors.
// Licensed under the Apache License, Version 2.0.
#if SUPPORTS_RUNTIME_INTRINSICS
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp
{
internal static partial class SimdUtils
{
public static class Avx2Intrinsics
{
    // Index vector for Avx2.PermuteVar8x32, stored as eight little-endian Int32 values: 0, 4, 1, 5, 2, 6, 3, 7.
    // The AVX2 pack instructions work on each 128-bit lane separately, so after the two packing steps
    // the 32-bit groups come out in the order [0, 2, 4, 6 | 1, 3, 5, 7]; this permutation restores source order.
    private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };

    /// <summary>
    /// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
    /// </summary>
    /// <param name="source">The source floats. On return it is sliced to the elements that were not converted.</param>
    /// <param name="dest">The destination bytes. On return it is sliced to the elements that were not written.</param>
    /// <remarks>
    /// Does nothing when AVX2 is not supported, or when fewer than <see cref="Vector256{T}.Count"/> bytes are requested.
    /// </remarks>
    [MethodImpl(InliningOptions.ShortMethod)]
    internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
        ref ReadOnlySpan<float> source,
        ref Span<byte> dest)
    {
        DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");

        if (Avx2.IsSupported)
        {
            // The worker below consumes whole Vector256<byte> chunks, so the remainder has to be computed
            // against Vector256<byte>.Count. Vector<byte>.Count is not guaranteed to match it, and a smaller
            // value would make us slice away elements the worker never converted.
            int remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
            int adjustedCount = source.Length - remainder;

            if (adjustedCount > 0)
            {
                BulkConvertNormalizedFloatToByteClampOverflows(
                    source.Slice(0, adjustedCount),
                    dest.Slice(0, adjustedCount));

                source = source.Slice(adjustedCount);
                dest = dest.Slice(adjustedCount);
            }
        }
    }

    /// <summary>
    /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
    /// </summary>
    /// <param name="source">The source floats; each value is multiplied by 255 before conversion.</param>
    /// <param name="dest">The destination bytes; one <see cref="Vector256{T}"/> of bytes is written per iteration.</param>
    /// <remarks>
    /// The caller is responsible for checking <see cref="Avx2.IsSupported"/>.
    /// The inputs are checked by <c>VerifySpanInput</c> against a chunk size of <c>Vector256&lt;byte&gt;.Count</c>.
    /// </remarks>
    internal static void BulkConvertNormalizedFloatToByteClampOverflows(
        ReadOnlySpan<float> source,
        Span<byte> dest)
    {
        VerifySpanInput(source, dest, Vector256<byte>.Count);

        int n = dest.Length / Vector256<byte>.Count;

        ref Vector256<float> sourceBase =
            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
        ref Vector256<byte> destBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

        var maxBytes = Vector256.Create(255f);
        ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
        Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase);

        for (int i = 0; i < n; i++)
        {
            // Four float vectors (4 x 8 floats) yield a single byte vector (32 bytes).
            ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);

            Vector256<float> f0 = s;
            Vector256<float> f1 = Unsafe.Add(ref s, 1);
            Vector256<float> f2 = Unsafe.Add(ref s, 2);
            Vector256<float> f3 = Unsafe.Add(ref s, 3);

            Vector256<int> w0 = ConvertToInt32(f0, maxBytes);
            Vector256<int> w1 = ConvertToInt32(f1, maxBytes);
            Vector256<int> w2 = ConvertToInt32(f2, maxBytes);
            Vector256<int> w3 = ConvertToInt32(f3, maxBytes);

            // Int32 -> Int16 with signed saturation, then Int16 -> Byte with unsigned saturation.
            // The second step is what clamps the results into [0, 255].
            Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
            Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
            Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);

            // Undo the per-lane interleaving introduced by the pack instructions.
            b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();

            Unsafe.Add(ref destBase, i) = b;
        }
    }

    /// <summary>
    /// Multiplies <paramref name="vf"/> by <paramref name="scale"/> and converts the products to 32-bit integers.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
    {
        vf = Avx.Multiply(vf, scale);
        return Avx.ConvertToVector256Int32(vf);
    }
}
}
}
#endif

6
src/ImageSharp/Common/Helpers/SimdUtils.cs

@ -92,11 +92,15 @@ namespace SixLabors.ImageSharp
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
#if SUPPORTS_EXTENDED_INTRINSICS
#if SUPPORTS_RUNTIME_INTRINSICS
Avx2Intrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
#elif SUPPORTS_EXTENDED_INTRINSICS
ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
#else
BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
#endif
// Also deals with the remainder from previous conversions:
FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
// Deal with the remainder:

6
src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs

@ -13,9 +13,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
internal abstract partial class JpegColorConverter
{
internal sealed class FromYCbCrSimdAvx2 : JpegColorConverter
internal sealed class FromYCbCrSimdVector8 : JpegColorConverter
{
public FromYCbCrSimdAvx2(int precision)
public FromYCbCrSimdVector8(int precision)
: base(JpegColorSpace.YCbCr, precision)
{
}
@ -107,4 +107,4 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
}
}
}
}
}

2
src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs

@ -93,7 +93,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
/// Returns the <see cref="JpegColorConverter"/> for the YCbCr colorspace that matches the current CPU architecture.
/// </summary>
private static JpegColorConverter GetYCbCrConverter(int precision) =>
FromYCbCrSimdAvx2.IsAvailable ? (JpegColorConverter)new FromYCbCrSimdAvx2(precision) : new FromYCbCrSimd(precision);
FromYCbCrSimdVector8.IsAvailable ? (JpegColorConverter)new FromYCbCrSimdVector8(precision) : new FromYCbCrSimd(precision);
/// <summary>
/// A stack-only struct to reference the input buffers using <see cref="ReadOnlySpan{T}"/>-s.

442
tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Round.cs

@ -4,6 +4,12 @@
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
using BenchmarkDotNet.Attributes;
@ -12,10 +18,14 @@ using SixLabors.ImageSharp.Formats.Jpeg.Components;
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
{
public class Block8x8F_Round
public unsafe class Block8x8F_Round
{
private Block8x8F block;
private readonly byte[] blockBuffer = new byte[512];
private GCHandle blockHandle;
private float* alignedPtr;
[GlobalSetup]
public void Setup()
{
@ -24,13 +34,27 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
throw new NotSupportedException("Vector<float>.Count != 8");
}
for (int i = 0; i < Block8x8F.Size; i++)
this.blockHandle = GCHandle.Alloc(this.blockBuffer, GCHandleType.Pinned);
ulong ptr = (ulong)this.blockHandle.AddrOfPinnedObject();
ptr += 16;
ptr -= ptr % 16;
if (ptr % 16 != 0)
{
this.block[i] = i * 44.8f;
throw new Exception("ptr is unaligned");
}
this.alignedPtr = (float*)ptr;
}
[Benchmark(Baseline = true)]
/// <summary>
/// Releases the pinned handle held in <c>blockHandle</c> and clears <c>alignedPtr</c>,
/// which must not be dereferenced once the handle has been freed.
/// </summary>
[GlobalCleanup]
public void Cleanup()
{
this.blockHandle.Free();
this.alignedPtr = null;
}
[Benchmark]
public void ScalarRound()
{
ref float b = ref Unsafe.As<Block8x8F, float>(ref this.block);
@ -42,8 +66,8 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
}
}
[Benchmark]
public void SimdRound()
[Benchmark(Baseline = true)]
public void SimdUtils_FastRound_Vector8()
{
ref Block8x8F b = ref this.block;
@ -64,5 +88,411 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
ref Vector<float> row7 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V7L);
row7 = SimdUtils.FastRound(row7);
}
/// <summary>
/// Rounds the eight rows of a <see cref="Block8x8F"/> with <c>SimdUtils.FastRound</c>, one row at a time,
/// where the block is the memory at <c>alignedPtr</c> reinterpreted as a <see cref="Block8x8F"/>
/// instead of the <c>block</c> field.
/// </summary>
[Benchmark]
public void SimdUtils_FastRound_Vector8_ForceAligned()
{
// NOTE(review): alignedPtr is presumably the 16-byte aligned address computed in Setup() - confirm.
ref Block8x8F b = ref Unsafe.AsRef<Block8x8F>(this.alignedPtr);
// Each row is viewed as a Vector<float> starting at the row's left Vector4 half.
ref Vector<float> row0 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V0L);
row0 = SimdUtils.FastRound(row0);
ref Vector<float> row1 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V1L);
row1 = SimdUtils.FastRound(row1);
ref Vector<float> row2 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V2L);
row2 = SimdUtils.FastRound(row2);
ref Vector<float> row3 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V3L);
row3 = SimdUtils.FastRound(row3);
ref Vector<float> row4 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V4L);
row4 = SimdUtils.FastRound(row4);
ref Vector<float> row5 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V5L);
row5 = SimdUtils.FastRound(row5);
ref Vector<float> row6 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V6L);
row6 = SimdUtils.FastRound(row6);
ref Vector<float> row7 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V7L);
row7 = SimdUtils.FastRound(row7);
}
/// <summary>
/// Rounds the eight rows of the <c>block</c> field with <c>SimdUtils.FastRound</c>, in two groups of four:
/// the four row references are taken first, and only then are the four rows rounded.
/// </summary>
[Benchmark]
public void SimdUtils_FastRound_Vector8_Grouped()
{
ref Block8x8F b = ref this.block;
// First group: rows 0-3.
ref Vector<float> row0 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V0L);
ref Vector<float> row1 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V1L);
ref Vector<float> row2 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V2L);
ref Vector<float> row3 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V3L);
// Plain assignments store through the refs, i.e. they write into the block.
row0 = SimdUtils.FastRound(row0);
row1 = SimdUtils.FastRound(row1);
row2 = SimdUtils.FastRound(row2);
row3 = SimdUtils.FastRound(row3);
// Second group: "= ref" rebinds the same four ref locals to rows 4-7.
row0 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V4L);
row1 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V5L);
row2 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V6L);
row3 = ref Unsafe.As<Vector4, Vector<float>>(ref b.V7L);
row0 = SimdUtils.FastRound(row0);
row1 = SimdUtils.FastRound(row1);
row2 = SimdUtils.FastRound(row2);
row3 = SimdUtils.FastRound(row3);
}
#if SUPPORTS_RUNTIME_INTRINSICS
/// <summary>
/// Rounds the block as sixteen <see cref="Vector128{T}"/> values, addressing every vector
/// by its element index relative to the first one.
/// </summary>
[Benchmark]
public void Sse41_V1()
{
    ref Vector128<float> first = ref Unsafe.As<Block8x8F, Vector128<float>>(ref this.block);
    ref Vector128<float> current = ref first;

    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 1);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 2);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 3);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 4);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 5);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 6);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 7);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 8);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 9);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 10);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 11);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 12);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 13);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 14);
    current = Sse41.RoundToNearestInteger(current);

    current = ref Unsafe.Add(ref first, 15);
    current = Sse41.RoundToNearestInteger(current);
}
/// <summary>
/// Rounds the block as sixteen <see cref="Vector128{T}"/> values, moving a single ref local forward
/// by a fixed byte offset between the (manually unrolled) rounding steps.
/// </summary>
/// <remarks>
/// Every vector is rounded exactly once (16 rounds in total), so the amount of work is the same as
/// in the other Sse41_* variants of this benchmark.
/// </remarks>
[Benchmark]
public unsafe void Sse41_V2()
{
    ref Vector128<float> p = ref Unsafe.As<Block8x8F, Vector128<float>>(ref this.block);
    var offset = (IntPtr)sizeof(Vector128<float>);

    // Vector 0.
    p = Sse41.RoundToNearestInteger(p);

    // Vectors 1..15: advance by one vector, then round in place.
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
    p = ref Unsafe.AddByteOffset(ref p, offset);
    p = Sse41.RoundToNearestInteger(p);
}
/// <summary>
/// Rounds the block as sixteen <see cref="Vector128{T}"/> values: the first one directly,
/// the remaining fifteen in a loop that advances a ref local by one vector per iteration.
/// </summary>
[Benchmark]
public unsafe void Sse41_V3()
{
    var vectorSize = (IntPtr)sizeof(Vector128<float>);
    ref Vector128<float> current = ref Unsafe.As<Block8x8F, Vector128<float>>(ref this.block);

    current = Sse41.RoundToNearestInteger(current);

    for (int remaining = 15; remaining > 0; remaining--)
    {
        current = ref Unsafe.AddByteOffset(ref current, vectorSize);
        current = Sse41.RoundToNearestInteger(current);
    }
}
/// <summary>
/// Rounds the block as sixteen <see cref="Vector128{T}"/> values in four groups of four:
/// for every group the four references are computed first, then the four vectors are rounded.
/// </summary>
[Benchmark]
public unsafe void Sse41_V4()
{
ref Vector128<float> p = ref Unsafe.As<Block8x8F, Vector128<float>>(ref this.block);
var offset = (IntPtr)sizeof(Vector128<float>);
// Group 1: vectors 0-3, each reference one vector past the previous one.
ref Vector128<float> a = ref p;
ref Vector128<float> b = ref Unsafe.AddByteOffset(ref a, offset);
ref Vector128<float> c = ref Unsafe.AddByteOffset(ref b, offset);
ref Vector128<float> d = ref Unsafe.AddByteOffset(ref c, offset);
a = Sse41.RoundToNearestInteger(a);
b = Sse41.RoundToNearestInteger(b);
c = Sse41.RoundToNearestInteger(c);
d = Sse41.RoundToNearestInteger(d);
// Group 2: vectors 4-7 ("= ref" rebinds the locals, continuing from d).
a = ref Unsafe.AddByteOffset(ref d, offset);
b = ref Unsafe.AddByteOffset(ref a, offset);
c = ref Unsafe.AddByteOffset(ref b, offset);
d = ref Unsafe.AddByteOffset(ref c, offset);
a = Sse41.RoundToNearestInteger(a);
b = Sse41.RoundToNearestInteger(b);
c = Sse41.RoundToNearestInteger(c);
d = Sse41.RoundToNearestInteger(d);
// Group 3: vectors 8-11.
a = ref Unsafe.AddByteOffset(ref d, offset);
b = ref Unsafe.AddByteOffset(ref a, offset);
c = ref Unsafe.AddByteOffset(ref b, offset);
d = ref Unsafe.AddByteOffset(ref c, offset);
a = Sse41.RoundToNearestInteger(a);
b = Sse41.RoundToNearestInteger(b);
c = Sse41.RoundToNearestInteger(c);
d = Sse41.RoundToNearestInteger(d);
// Group 4: vectors 12-15.
a = ref Unsafe.AddByteOffset(ref d, offset);
b = ref Unsafe.AddByteOffset(ref a, offset);
c = ref Unsafe.AddByteOffset(ref b, offset);
d = ref Unsafe.AddByteOffset(ref c, offset);
a = Sse41.RoundToNearestInteger(a);
b = Sse41.RoundToNearestInteger(b);
c = Sse41.RoundToNearestInteger(c);
d = Sse41.RoundToNearestInteger(d);
}
/// <summary>
/// Rounds sixteen consecutive <see cref="Vector128{T}"/> values (64 floats, the size of one block)
/// using unaligned loads and stores, starting one float past <c>alignedPtr</c>.
/// </summary>
/// <remarks>
/// The pointer advances by 4 floats per step, which is exactly one <c>Vector128&lt;float&gt;</c>.
/// A stride of 8 would skip every other vector and touch 500 bytes past <c>alignedPtr</c>,
/// which can run past the end of the 512-byte pinned buffer.
/// </remarks>
[Benchmark]
public unsafe void Sse41_V5_Unaligned()
{
    // The "+ 1" offset is what makes the accesses unaligned relative to alignedPtr.
    float* p = this.alignedPtr + 1;

    Vector128<float> v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
    p += 4;

    v = Sse.LoadVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.Store(p, v);
}
/// <summary>
/// Rounds sixteen consecutive <see cref="Vector128{T}"/> values (64 floats, the size of one block)
/// using aligned loads and stores, starting at <c>alignedPtr</c>.
/// </summary>
/// <remarks>
/// The pointer advances by 4 floats per step, which is exactly one <c>Vector128&lt;float&gt;</c>;
/// this also keeps every access 16-byte aligned as long as <c>alignedPtr</c> is.
/// A stride of 8 would skip every other vector and process memory beyond the 64 floats of a block.
/// </remarks>
[Benchmark]
public unsafe void Sse41_V5_Aligned()
{
    float* p = this.alignedPtr;

    Vector128<float> v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
    p += 4;

    v = Sse.LoadAlignedVector128(p);
    v = Sse41.RoundToNearestInteger(v);
    Sse.StoreAligned(p, v);
}
/// <summary>
/// Rounds the 64 floats starting at <c>alignedPtr</c> in two halves of 32 floats,
/// each half handled by one call to <c>Round8SseVectors</c>.
/// </summary>
[Benchmark]
public void Sse41_V6_Aligned()
{
    float* lowerHalf = this.alignedPtr;
    float* upperHalf = lowerHalf + 32;

    Round8SseVectors(lowerHalf);
    Round8SseVectors(upperHalf);
}
/// <summary>
/// Rounds 32 consecutive floats in place as eight <see cref="Vector128{T}"/> values:
/// all eight aligned loads first, then all eight roundings, then all eight aligned stores.
/// </summary>
/// <param name="p0">Address of the first float; the aligned load/store instructions used here expect it to be 16-byte aligned.</param>
private static void Round8SseVectors(float* p0)
{
    // One Vector128<float> holds 4 floats, so consecutive vectors are 4 floats apart.
    float* p1 = p0 + 4;
    float* p2 = p0 + 8;
    float* p3 = p0 + 12;
    float* p4 = p0 + 16;
    float* p5 = p0 + 20;
    float* p6 = p0 + 24;
    float* p7 = p0 + 28;

    Vector128<float> r0 = Sse.LoadAlignedVector128(p0);
    Vector128<float> r1 = Sse.LoadAlignedVector128(p1);
    Vector128<float> r2 = Sse.LoadAlignedVector128(p2);
    Vector128<float> r3 = Sse.LoadAlignedVector128(p3);
    Vector128<float> r4 = Sse.LoadAlignedVector128(p4);
    Vector128<float> r5 = Sse.LoadAlignedVector128(p5);
    Vector128<float> r6 = Sse.LoadAlignedVector128(p6);
    Vector128<float> r7 = Sse.LoadAlignedVector128(p7);

    r0 = Sse41.RoundToNearestInteger(r0);
    r1 = Sse41.RoundToNearestInteger(r1);
    r2 = Sse41.RoundToNearestInteger(r2);
    r3 = Sse41.RoundToNearestInteger(r3);
    r4 = Sse41.RoundToNearestInteger(r4);
    r5 = Sse41.RoundToNearestInteger(r5);
    r6 = Sse41.RoundToNearestInteger(r6);
    r7 = Sse41.RoundToNearestInteger(r7);

    Sse.StoreAligned(p0, r0);
    Sse.StoreAligned(p1, r1);
    Sse.StoreAligned(p2, r2);
    Sse.StoreAligned(p3, r3);
    Sse.StoreAligned(p4, r4);
    Sse.StoreAligned(p5, r5);
    Sse.StoreAligned(p6, r6);
    Sse.StoreAligned(p7, r7);
}
#endif
}
}

10
tests/ImageSharp.Benchmarks/Codecs/Jpeg/YCbCrColorConversion.cs

@ -11,7 +11,7 @@ using SixLabors.ImageSharp.Memory;
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
{
[Config(typeof(Config.ShortClr))]
[Config(typeof(Config.ShortCore31))]
public class YCbCrColorConversion
{
private Buffer2D<float>[] input;
@ -36,7 +36,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
}
}
[Benchmark(Baseline = true)]
[Benchmark]
public void Scalar()
{
var values = new JpegColorConverter.ComponentValues(this.input, 0);
@ -44,7 +44,7 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
JpegColorConverter.FromYCbCrBasic.ConvertCore(values, this.output, 255F, 128F);
}
[Benchmark]
[Benchmark(Baseline = true)]
public void SimdVector4()
{
var values = new JpegColorConverter.ComponentValues(this.input, 0);
@ -53,11 +53,11 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
}
[Benchmark]
public void SimdAvx2()
public void SimdVector8()
{
var values = new JpegColorConverter.ComponentValues(this.input, 0);
JpegColorConverter.FromYCbCrSimdAvx2.ConvertCore(values, this.output, 255F, 128F);
JpegColorConverter.FromYCbCrSimdVector8.ConvertCore(values, this.output, 255F, 128F);
}
private static Buffer2D<float>[] CreateRandomValues(

122
tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs

@ -7,15 +7,21 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Attributes;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Environments;
using BenchmarkDotNet.Jobs;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{
[Config(typeof(Config.ShortClr))]
[Config(typeof(Config.ShortCore31))]
public abstract class FromVector4<TPixel>
where TPixel : unmanaged, IPixel<TPixel>
{
@ -25,7 +31,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
protected Configuration Configuration => Configuration.Default;
[Params(64, 2048)]
// [Params(64, 2048)]
[Params(1024)]
public int Count { get; set; }
[GlobalSetup]
@ -77,7 +84,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats);
}
[Benchmark(Baseline = true)]
[Benchmark]
public void BasicIntrinsics256()
{
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
@ -86,7 +93,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats);
}
[Benchmark]
[Benchmark(Baseline = true)]
public void ExtendedIntrinsic()
{
Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
@ -95,31 +102,84 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats);
}
// RESULTS (2018 October):
// Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Gen 0 | Allocated |
// ---------------------------- |-------- |------ |-------------:|-------------:|------------:|-------:|---------:|-------:|----------:|
// FallbackIntrinsics128 | Clr | 64 | 340.38 ns | 22.319 ns | 1.2611 ns | 1.41 | 0.01 | - | 0 B |
// BasicIntrinsics256 | Clr | 64 | 240.79 ns | 11.421 ns | 0.6453 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsic | Clr | 64 | 199.09 ns | 124.239 ns | 7.0198 ns | 0.83 | 0.02 | - | 0 B |
// PixelOperations_Base | Clr | 64 | 647.99 ns | 24.003 ns | 1.3562 ns | 2.69 | 0.01 | 0.0067 | 24 B |
// PixelOperations_Specialized | Clr | 64 | 259.79 ns | 13.391 ns | 0.7566 ns | 1.08 | 0.00 | - | 0 B | <--- ceremonial overhead has been minimized!
// | | | | | | | | | |
// FallbackIntrinsics128 | Core | 64 | 234.64 ns | 12.320 ns | 0.6961 ns | 1.58 | 0.00 | - | 0 B |
// BasicIntrinsics256 | Core | 64 | 148.87 ns | 2.794 ns | 0.1579 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsic | Core | 64 | 94.06 ns | 10.015 ns | 0.5659 ns | 0.63 | 0.00 | - | 0 B |
// PixelOperations_Base | Core | 64 | 573.52 ns | 31.865 ns | 1.8004 ns | 3.85 | 0.01 | 0.0067 | 24 B |
// PixelOperations_Specialized | Core | 64 | 117.21 ns | 13.264 ns | 0.7494 ns | 0.79 | 0.00 | - | 0 B |
// | | | | | | | | | |
// FallbackIntrinsics128 | Clr | 2048 | 6,735.93 ns | 2,139.340 ns | 120.8767 ns | 1.71 | 0.03 | - | 0 B |
// BasicIntrinsics256 | Clr | 2048 | 3,929.29 ns | 334.027 ns | 18.8731 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsic | Clr | 2048 | 2,226.01 ns | 130.525 ns | 7.3749 ns |!! 0.57 | 0.00 | - | 0 B | <--- ExtendedIntrinsics rock!
// PixelOperations_Base | Clr | 2048 | 16,760.84 ns | 367.800 ns | 20.7814 ns | 4.27 | 0.02 | - | 24 B | <--- Extra copies using "Vector4 TPixel.ToVector4()"
// PixelOperations_Specialized | Clr | 2048 | 3,986.03 ns | 237.238 ns | 13.4044 ns | 1.01 | 0.00 | - | 0 B | <--- can't yet detect whether ExtendedIntrinsics are available :(
// | | | | | | | | | |
// FallbackIntrinsics128 | Core | 2048 | 6,644.65 ns | 2,677.090 ns | 151.2605 ns | 1.69 | 0.05 | - | 0 B |
// BasicIntrinsics256 | Core | 2048 | 3,923.70 ns | 1,971.760 ns | 111.4081 ns | 1.00 | 0.00 | - | 0 B |
// ExtendedIntrinsic | Core | 2048 | 2,092.32 ns | 375.657 ns | 21.2253 ns |!! 0.53 | 0.01 | - | 0 B | <--- ExtendedIntrinsics rock!
// PixelOperations_Base | Core | 2048 | 16,875.73 ns | 1,271.957 ns | 71.8679 ns | 4.30 | 0.10 | - | 24 B |
// PixelOperations_Specialized | Core | 2048 | 2,129.92 ns | 262.888 ns | 14.8537 ns |!! 0.54 | 0.01 | - | 0 B | <--- ExtendedIntrinsics rock!
#if SUPPORTS_RUNTIME_INTRINSICS
/// <summary>
/// Benchmarks <c>SimdUtils.Avx2Intrinsics.BulkConvertNormalizedFloatToByteClampOverflows</c>
/// on the source buffer viewed as floats and the destination buffer viewed as bytes.
/// </summary>
[Benchmark]
public void UseAvx2()
{
    Span<float> sourceFloats = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
    Span<byte> destBytes = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());

    SimdUtils.Avx2Intrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sourceFloats, destBytes);
}
// Raw bytes of the index vector passed to Avx2.PermuteVar8x32 in UseAvx2_Grouped.
// Read as eight little-endian Int32 values they are 0, 4, 1, 5, 2, 6, 3, 7.
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
/// <summary>
/// Benchmarks an inlined AVX2 float -> byte conversion in which the four multiplications,
/// the four conversions and the packing steps of each iteration are grouped by operation.
/// </summary>
/// <remarks>
/// Each iteration reads four <c>Vector256&lt;float&gt;</c> values and writes one <c>Vector256&lt;byte&gt;</c>,
/// so the iteration count is derived from <c>Vector256&lt;byte&gt;.Count</c> rather than from
/// <c>Vector&lt;byte&gt;.Count</c>, which is not guaranteed to be the same number.
/// </remarks>
[Benchmark]
public void UseAvx2_Grouped()
{
    Span<float> src = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
    Span<byte> dest = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());

    int n = dest.Length / Vector256<byte>.Count;

    ref Vector256<float> sourceBase =
        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(src));
    ref Vector256<byte> destBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

    ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
    Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase);

    var maxBytes = Vector256.Create(255f);

    for (int i = 0; i < n; i++)
    {
        ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);

        Vector256<float> f0 = s;
        Vector256<float> f1 = Unsafe.Add(ref s, 1);
        Vector256<float> f2 = Unsafe.Add(ref s, 2);
        Vector256<float> f3 = Unsafe.Add(ref s, 3);

        f0 = Avx.Multiply(maxBytes, f0);
        f1 = Avx.Multiply(maxBytes, f1);
        f2 = Avx.Multiply(maxBytes, f2);
        f3 = Avx.Multiply(maxBytes, f3);

        Vector256<int> w0 = Avx.ConvertToVector256Int32(f0);
        Vector256<int> w1 = Avx.ConvertToVector256Int32(f1);
        Vector256<int> w2 = Avx.ConvertToVector256Int32(f2);
        Vector256<int> w3 = Avx.ConvertToVector256Int32(f3);

        // Int32 -> Int16 (signed saturation), then Int16 -> Byte (unsigned saturation).
        Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
        Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
        Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);

        // The pack instructions work per 128-bit lane; the permutation restores source order.
        b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();

        Unsafe.Add(ref destBase, i) = b;
    }
}
/// <summary>
/// Multiplies <paramref name="vf"/> by <paramref name="scale"/> and converts the products to 32-bit integers.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
{
    Vector256<float> scaled = Avx.Multiply(scale, vf);
    return Avx.ConvertToVector256Int32(scaled);
}
#endif
// *** RESULTS 2020 March: ***
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
// .NET Core SDK=3.1.200-preview-014971
// Job-IUZXZT : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT
//
// | Method | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
// |---------------------------- |------ |-----------:|------------:|----------:|------:|--------:|------:|------:|------:|----------:|
// | FallbackIntrinsics128 | 1024 | 2,952.6 ns | 1,680.77 ns | 92.13 ns | 3.32 | 0.16 | - | - | - | - |
// | BasicIntrinsics256 | 1024 | 1,664.5 ns | 928.11 ns | 50.87 ns | 1.87 | 0.09 | - | - | - | - |
// | ExtendedIntrinsic | 1024 | 890.6 ns | 375.48 ns | 20.58 ns | 1.00 | 0.00 | - | - | - | - |
// | UseAvx2 | 1024 | 299.0 ns | 30.47 ns | 1.67 ns | 0.34 | 0.01 | - | - | - | - |
// | UseAvx2_Grouped | 1024 | 318.1 ns | 48.19 ns | 2.64 ns | 0.36 | 0.01 | - | - | - | - |
// | PixelOperations_Base | 1024 | 8,136.9 ns | 1,834.82 ns | 100.57 ns | 9.14 | 0.26 | - | - | - | 24 B |
// | PixelOperations_Specialized | 1024 | 951.1 ns | 123.93 ns | 6.79 ns | 1.07 | 0.03 | - | - | - | - |
}
}

8
tests/ImageSharp.Benchmarks/Config.cs

@ -38,6 +38,14 @@ namespace SixLabors.ImageSharp.Benchmarks
}
}
/// <summary>
/// Benchmark configuration that adds a single short job on the .NET Core 3.1 runtime:
/// 1 launch, 3 warmup iterations and 3 measured iterations.
/// </summary>
public class ShortCore31 : Config
{
    public ShortCore31()
    {
        var shortJob = Job.Default
            .With(CoreRuntime.Core31)
            .WithLaunchCount(1)
            .WithWarmupCount(3)
            .WithIterationCount(3);

        this.Add(shortJob);
    }
}
#if Windows_NT
private bool IsElevated
{

19
tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

@ -6,6 +6,7 @@ using System.Linq;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Common.Tuples;
using Xunit;
@ -277,6 +278,24 @@ namespace SixLabors.ImageSharp.Tests.Common
Assert.Equal(expected2, actual2);
}
#if SUPPORTS_RUNTIME_INTRINSICS
// Runs the shared conversion test against the AVX2 implementation.
// On hardware without AVX2 the test body is not executed and the test passes trivially.
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy32))]
public void Avx2_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
    if (System.Runtime.Intrinsics.X86.Avx2.IsSupported)
    {
        TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
            count,
            (src, dst) => SimdUtils.Avx2Intrinsics.BulkConvertNormalizedFloatToByteClampOverflows(src.Span, dst.Span));
    }
}
#endif
[Theory]
[MemberData(nameof(ArbitraryArraySizes))]
public void BulkConvertNormalizedFloatToByteClampOverflows(int count)

2
tests/ImageSharp.Tests/Formats/Jpg/JpegColorConverterTests.cs

@ -107,7 +107,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
// JpegColorConverter.FromYCbCrSimdAvx2.LogPlz = s => this.Output.WriteLine(s);
ValidateRgbToYCbCrConversion(
new JpegColorConverter.FromYCbCrSimdAvx2(8),
new JpegColorConverter.FromYCbCrSimdVector8(8),
3,
inputBufferLength,
resultBufferLength,

Loading…
Cancel
Save