mirror of https://github.com/SixLabors/ImageSharp
committed by
GitHub
35 changed files with 1977 additions and 1551 deletions
@ -0,0 +1,149 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
    internal partial struct Block8x8F
    {
        /// <summary>
        /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
        /// </summary>
        public const int RowCount = 8;

        // AVX-sized views over the block storage: each Vector256<float> holds one
        // 8-float row, placed back to back via explicit field offsets (32 bytes per
        // row) so the same memory can also be addressed through the struct's other
        // overlapping fields declared in another part of this partial struct.
        [FieldOffset(0)]
        public Vector256<float> V0;
        [FieldOffset(32)]
        public Vector256<float> V1;
        [FieldOffset(64)]
        public Vector256<float> V2;
        [FieldOffset(96)]
        public Vector256<float> V3;
        [FieldOffset(128)]
        public Vector256<float> V4;
        [FieldOffset(160)]
        public Vector256<float> V5;
        [FieldOffset(192)]
        public Vector256<float> V6;
        [FieldOffset(224)]
        public Vector256<float> V7;

        // Permutation applied after Avx2.PackSignedSaturate, which packs its two
        // sources per 128-bit lane; this mask restores linear element order.
        private static readonly Vector256<int> MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);

        /// <summary>
        /// Multiplies <paramref name="a"/> and <paramref name="b"/> element-wise,
        /// converts the products to <see cref="short"/> with signed saturation and
        /// stores them into <paramref name="dest"/>, using AVX2 instructions.
        /// </summary>
        /// <param name="a">Left multiplicand block.</param>
        /// <param name="b">Right multiplicand block.</param>
        /// <param name="dest">Destination block of 16-bit coefficients.</param>
        private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
        {
            DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

            ref Vector256<float> aBase = ref a.V0;
            ref Vector256<float> bBase = ref b.V0;

            ref Vector256<short> destRef = ref dest.V01;

            // Each iteration packs two float rows (2 x 8 floats) into one
            // Vector256<short> destination row pair.
            for (nint i = 0; i < 8; i += 2)
            {
                Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
                Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));

                // PackSignedSaturate works per 128-bit lane; the variable permute
                // below undoes the resulting lane interleaving.
                Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
                row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16();

                Unsafe.Add(ref destRef, (IntPtr)((uint)i / 2)) = row;
            }
        }

        /// <summary>
        /// Multiplies <paramref name="a"/> and <paramref name="b"/> element-wise,
        /// converts the products to <see cref="short"/> with signed saturation and
        /// stores them into <paramref name="dest"/>, using SSE2 instructions.
        /// </summary>
        /// <param name="a">Left multiplicand block.</param>
        /// <param name="b">Right multiplicand block.</param>
        /// <param name="dest">Destination block of 16-bit coefficients.</param>
        private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
        {
            DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");

            // Reinterpret each 64-float block as a sequence of 16 Vector128<float>
            // (4 floats each); the destination as 8 Vector128<short>.
            ref Vector128<float> aBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
            ref Vector128<float> bBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);

            ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);

            // Pack two 4-float products into one Vector128<short> per iteration.
            // Unlike the AVX2 path, 128-bit packing keeps elements in order, so no
            // post-shuffle is needed.
            for (int i = 0; i < 16; i += 2)
            {
                Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
                Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));

                Vector128<short> row = Sse2.PackSignedSaturate(left, right);
                Unsafe.Add(ref destBase, (IntPtr)((uint)i / 2)) = row;
            }
        }

        /// <summary>
        /// Transposes the 8x8 matrix in place using AVX instructions.
        /// </summary>
        private void TransposeInplace_Avx()
        {
            // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
            // NOTE(review): V0R/V4L etc. are presumably the Vector4 right/left
            // halves of the corresponding rows, declared in another part of this
            // partial struct — confirm. The InsertVector128 calls gather pairs of
            // half-rows into full 256-bit registers before the unpack/shuffle/blend
            // steps below.
            Vector256<float> r0 = Avx.InsertVector128(
                this.V0,
                Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
                1);

            Vector256<float> r1 = Avx.InsertVector128(
                this.V1,
                Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
                1);

            Vector256<float> r2 = Avx.InsertVector128(
                this.V2,
                Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
                1);

            Vector256<float> r3 = Avx.InsertVector128(
                this.V3,
                Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
                1);

            Vector256<float> r4 = Avx.InsertVector128(
                Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
                Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
                1);

            Vector256<float> r5 = Avx.InsertVector128(
                Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
                Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
                1);

            Vector256<float> r6 = Avx.InsertVector128(
                Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
                Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
                1);

            Vector256<float> r7 = Avx.InsertVector128(
                Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
                Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
                1);

            // 4x4 sub-block transpose via unpack + shuffle + blend, two output
            // rows at a time.
            Vector256<float> t0 = Avx.UnpackLow(r0, r1);
            Vector256<float> t2 = Avx.UnpackLow(r2, r3);
            Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
            this.V0 = Avx.Blend(t0, v, 0xCC);
            this.V1 = Avx.Blend(t2, v, 0x33);

            Vector256<float> t4 = Avx.UnpackLow(r4, r5);
            Vector256<float> t6 = Avx.UnpackLow(r6, r7);
            v = Avx.Shuffle(t4, t6, 0x4E);
            this.V4 = Avx.Blend(t4, v, 0xCC);
            this.V5 = Avx.Blend(t6, v, 0x33);

            Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
            Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
            v = Avx.Shuffle(t1, t3, 0x4E);
            this.V2 = Avx.Blend(t1, v, 0xCC);
            this.V3 = Avx.Blend(t3, v, 0x33);

            Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
            Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
            v = Avx.Shuffle(t5, t7, 0x4E);
            this.V6 = Avx.Blend(t5, v, 0xCC);
            this.V7 = Avx.Blend(t7, v, 0x33);
        }
    }
}
|||
#endif
|
|||
@ -0,0 +1,161 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System.Diagnostics; |
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
    internal static partial class FastFloatingPointDCT
    {
#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
        // Broadcast scaling constants for the FDCT/IDCT kernels below.
        // NOTE(review): the values appear to match the AAN (Arai/Agui/Nakajima)
        // floating point DCT factorization as used by libjpeg's jfdctflt/jidctflt
        // — confirm before renaming or altering.
        private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
        private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
        private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
        private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);

        private static readonly Vector256<float> mm256_F_1_1758 = Vector256.Create(1.175876f);
        private static readonly Vector256<float> mm256_F_n1_9615 = Vector256.Create(-1.961570560f);
        private static readonly Vector256<float> mm256_F_n0_3901 = Vector256.Create(-0.390180644f);
        private static readonly Vector256<float> mm256_F_n0_8999 = Vector256.Create(-0.899976223f);
        private static readonly Vector256<float> mm256_F_n2_5629 = Vector256.Create(-2.562915447f);
        private static readonly Vector256<float> mm256_F_0_2986 = Vector256.Create(0.298631336f);
        private static readonly Vector256<float> mm256_F_2_0531 = Vector256.Create(2.053119869f);
        private static readonly Vector256<float> mm256_F_3_0727 = Vector256.Create(3.072711026f);
        private static readonly Vector256<float> mm256_F_1_5013 = Vector256.Create(1.501321110f);
        private static readonly Vector256<float> mm256_F_n1_8477 = Vector256.Create(-1.847759065f);
        private static readonly Vector256<float> mm256_F_0_7653 = Vector256.Create(0.765366865f);
#pragma warning restore SA1310, SA1311, IDE1006

        /// <summary>
        /// Apply floating point FDCT inplace using simd operations.
        /// </summary>
        /// <remarks>
        /// The 2D transform is separable: each pass runs the 1D FDCT on all 8 rows
        /// at once (one Vector256 lane per column), with a transpose before each
        /// pass so both dimensions get processed.
        /// </remarks>
        /// <param name="block">Input matrix.</param>
        private static void ForwardTransform_Avx(ref Block8x8F block)
        {
            DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");

            // First pass - process rows
            block.TransposeInplace();
            FDCT8x8_Avx(ref block);

            // Second pass - process columns
            block.TransposeInplace();
            FDCT8x8_Avx(ref block);
        }

        /// <summary>
        /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix.
        /// </summary>
        /// <remarks>
        /// Requires Avx support.
        /// </remarks>
        /// <param name="block">Input matrix.</param>
        public static void FDCT8x8_Avx(ref Block8x8F block)
        {
            DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");

            // Butterfly stage: sums and differences of mirrored row pairs.
            Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
            Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
            Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
            Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
            Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
            Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
            Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
            Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);

            // Even part
            Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
            Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
            Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
            Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);

            block.V0 = Avx.Add(tmp10, tmp11);
            block.V4 = Avx.Subtract(tmp10, tmp11);

            Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
            block.V2 = Avx.Add(tmp13, z1);
            block.V6 = Avx.Subtract(tmp13, z1);

            // Odd part
            tmp10 = Avx.Add(tmp4, tmp5);
            tmp11 = Avx.Add(tmp5, tmp6);
            tmp12 = Avx.Add(tmp6, tmp7);

            Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
            Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
            Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
            Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);

            Vector256<float> z11 = Avx.Add(tmp7, z3);
            Vector256<float> z13 = Avx.Subtract(tmp7, z3);

            block.V5 = Avx.Add(z13, z2);
            block.V3 = Avx.Subtract(z13, z2);
            block.V1 = Avx.Add(z11, z4);
            block.V7 = Avx.Subtract(z11, z4);
        }

        /// <summary>
        /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
        /// using AVX commands.
        /// </summary>
        /// <param name="s">Source</param>
        /// <param name="d">Destination</param>
        public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
        {
            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");

            // Odd part: rows 1, 3, 5, 7.
            Vector256<float> my1 = s.V1;
            Vector256<float> my7 = s.V7;
            Vector256<float> mz0 = Avx.Add(my1, my7);

            Vector256<float> my3 = s.V3;
            Vector256<float> mz2 = Avx.Add(my3, my7);
            Vector256<float> my5 = s.V5;
            Vector256<float> mz1 = Avx.Add(my3, my5);
            Vector256<float> mz3 = Avx.Add(my1, my5);

            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758);

            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615);
            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901);
            mz0 = Avx.Multiply(mz0, mm256_F_n0_8999);
            mz1 = Avx.Multiply(mz1, mm256_F_n2_5629);

            Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2);
            Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3);
            Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2);
            Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3);

            // Even part: rows 0, 2, 4, 6.
            Vector256<float> my2 = s.V2;
            Vector256<float> my6 = s.V6;
            mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411);
            Vector256<float> my0 = s.V0;
            Vector256<float> my4 = s.V4;
            mz0 = Avx.Add(my0, my4);
            mz1 = Avx.Subtract(my0, my4);
            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477);
            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653);

            my0 = Avx.Add(mz0, mz3);
            my3 = Avx.Subtract(mz0, mz3);
            my1 = Avx.Add(mz1, mz2);
            my2 = Avx.Subtract(mz1, mz2);

            // Final butterfly combining even and odd results into the destination.
            d.V0 = Avx.Add(my0, mb0);
            d.V7 = Avx.Subtract(my0, mb0);
            d.V1 = Avx.Add(my1, mb1);
            d.V6 = Avx.Subtract(my1, mb1);
            d.V2 = Avx.Add(my2, mb2);
            d.V5 = Avx.Subtract(my2, mb2);
            d.V3 = Avx.Add(my3, mb3);
            d.V4 = Avx.Subtract(my3, mb3);
        }
    }
}
|||
#endif
|
|||
@ -0,0 +1,300 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
#if SUPPORTS_RUNTIME_INTRINSICS
|
|||
using System; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
    internal static partial class ZigZag
    {
#pragma warning disable SA1309 // naming rules violation warnings
        /// <summary>
        /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics.
        /// </summary>
        private const byte _ = 0xff;
#pragma warning restore SA1309

        /// <summary>
        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSsse3"/>
        /// zig zag implementation.
        /// </summary>
        private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
        {
            // row0
            0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
            _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
            _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,

            // row1
            _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
            2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
            _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,

            // row2
            _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
            _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,

            // row3
            _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
            _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
            _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
            6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,

            // row4
            _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
            _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
            _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,

            // row5
            _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
            10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,

            // row6
            _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
            _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
            4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,

            // row7
            10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
            _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
        };

        /// <summary>
        /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx2"/>
        /// zig zag implementation.
        /// </summary>
        private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
        {
            // 01_AB/01_EF/23_CD - cross-lane
            0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0,

            // 01_AB - inner-lane
            0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7,

            // 01_CD/23_GH - cross-lane
            0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _,

            // 01_CD - inner-lane
            _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _,

            // 01_EF - inner-lane
            _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,

            // 23_AB/45_CD/67_EF - cross-lane
            3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,

            // 23_AB - inner-lane
            4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _,

            // 23_CD - inner-lane
            _, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13,

            // 23_EF - inner-lane
            _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,

            // 23_GH - inner-lane
            _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,

            // 45_AB - inner-lane
            _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _,

            // 45_CD - inner-lane
            _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _,

            // 45_EF - cross-lane
            1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,

            // 45_EF - inner-lane
            2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _,

            // 45_GH - inner-lane
            _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,

            // 67_CD - inner-lane
            _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,

            // 67_EF - inner-lane
            _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _,

            // 67_GH - inner-lane
            8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15
        };

        /// <summary>
        /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
        /// </summary>
        /// <param name="block">Input matrix.</param>
        public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
        {
            DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");

            fixed (byte* maskPtr = SseShuffleMasks)
            {
                Vector128<byte> rowA = block.V0.AsByte();
                Vector128<byte> rowB = block.V1.AsByte();
                Vector128<byte> rowC = block.V2.AsByte();
                Vector128<byte> rowD = block.V3.AsByte();
                Vector128<byte> rowE = block.V4.AsByte();
                Vector128<byte> rowF = block.V5.AsByte();
                Vector128<byte> rowG = block.V6.AsByte();
                Vector128<byte> rowH = block.V7.AsByte();

                // Each output row is built by shuffling the contributing source
                // rows with masks that zero non-contributing slots (via the `_`
                // sentinel) and OR-ing the results; stray single coefficients are
                // patched in with Extract/Insert.

                // row0 - A0 A1 B0 C0 B1 A2 A3 B2
                Vector128<short> rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16();
                Vector128<short> rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16();
                Vector128<short> row0 = Sse2.Or(rowA0, rowB0);
                Vector128<short> rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16();
                row0 = Sse2.Or(row0, rowC0);

                // row1 - C1 D0 E0 D1 C2 B3 A4 A5
                Vector128<short> rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16();
                Vector128<short> rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16();
                Vector128<short> row1 = Sse2.Or(rowA1, rowC1);
                Vector128<short> rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16();
                row1 = Sse2.Or(row1, rowD1);
                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16();
                row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16();

                // row2
                Vector128<short> rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16();
                Vector128<short> rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16();
                Vector128<short> row2 = Sse2.Or(rowE2, rowF2);
                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16();
                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16();
                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16();
                row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16();

                // row3
                // Fixed: dropped the redundant second .AsInt16() cast present in
                // the original on the next two lines.
                Vector128<short> rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16();
                Vector128<short> rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16();
                Vector128<short> row3 = Sse2.Or(rowA3, rowB3);
                Vector128<short> rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16();
                row3 = Sse2.Or(row3, rowC3);
                Vector128<byte> shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11));
                Vector128<short> rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16();
                row3 = Sse2.Or(row3, rowD3);

                // row4 (reuses the row3 D mask for row E)
                Vector128<short> rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16();
                Vector128<short> rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16();
                Vector128<short> row4 = Sse2.Or(rowE4, rowF4);
                Vector128<short> rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16();
                row4 = Sse2.Or(row4, rowG4);
                Vector128<short> rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16();
                row4 = Sse2.Or(row4, rowH4);

                // row5
                Vector128<short> rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16();
                Vector128<short> rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
                Vector128<short> row5 = Sse2.Or(rowC5, rowD5);
                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16();
                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16();
                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16();
                row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16();

                // row6
                Vector128<short> rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16();
                Vector128<short> rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16();
                Vector128<short> row6 = Sse2.Or(rowE6, rowF6);
                Vector128<short> rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16();
                row6 = Sse2.Or(row6, rowH6);
                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16();
                row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16();

                // row7
                Vector128<short> rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16();
                Vector128<short> rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16();
                Vector128<short> row7 = Sse2.Or(rowG7, rowH7);
                row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16();

                block.V0 = row0;
                block.V1 = row1;
                block.V2 = row2;
                block.V3 = row3;
                block.V4 = row4;
                block.V5 = row5;
                block.V6 = row6;
                block.V7 = row7;
            }
        }

        /// <summary>
        /// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics.
        /// </summary>
        /// <param name="block">Input matrix.</param>
        public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block)
        {
            DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

            fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
            {
                Vector256<byte> rowsAB = block.V01.AsByte();
                Vector256<byte> rowsCD = block.V23.AsByte();
                Vector256<byte> rowsEF = block.V45.AsByte();
                Vector256<byte> rowsGH = block.V67.AsByte();

                // Pattern per output row pair: PermuteVar8x32 moves 32-bit chunks
                // across 128-bit lanes, Shuffle reorders bytes within lanes, and
                // the per-source results are OR-ed together.
                // Cleanup vs. original: redundant .AsByte() casts removed — both
                // Avx.LoadVector256(byte*) and Avx2.Shuffle(byte, byte) already
                // produce Vector256<byte>.

                // rows 0 1
                Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
                Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32)));

                Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
                Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
                row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32)));

                Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32)));

                Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);

                // rows 2 3
                Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
                Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32)));

                Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
                row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32)));

                Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32)));

                Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
                Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)));

                Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));

                // rows 4 5
                Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)));
                Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)));

                Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
                Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
                row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)));

                Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)));

                Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));

                // rows 6 7
                Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)));

                Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
                row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)));

                Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
                row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)));

                Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);

                block.V01 = row01.AsInt16();
                block.V23 = row23.AsInt16();
                block.V45 = row45.AsInt16();
                block.V67 = row67.AsInt16();
            }
        }
    }
}
|||
#endif
|
|||
@ -0,0 +1,50 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
{
    /// <summary>
    /// Benchmarks <see cref="Block8x8F"/> quantization under the scalar, SSE and
    /// AVX hardware-intrinsics job configurations.
    /// </summary>
    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
    public class Block8x8F_Quantize
    {
        private Block8x8F block = FilledWith(1);
        private Block8x8F quant = FilledWith(1);
        private Block8x8 result = default;

        [Benchmark]
        public short Quantize()
        {
            Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant);

            // Return one coefficient so the call cannot be eliminated as dead code.
            return this.result[0];
        }

        // Builds a block whose 64 coefficients all equal the given value.
        private static Block8x8F FilledWith(float value)
        {
            Block8x8F filled = default;
            int index = 0;
            while (index < 64)
            {
                filled[index] = value;
                index++;
            }

            return filled;
        }
    }
}
|||
|
|||
/* |
|||
BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update) |
|||
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores |
|||
.NET SDK=6.0.100-preview.3.21202.5 |
|||
[Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT |
|||
1. No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT |
|||
2. SSE : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT |
|||
3. AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT |
|||
|
|||
| Method | Job | Mean | Error | StdDev | Ratio | |
|||
|--------- |-----------------|---------:|---------:|---------:|------:| |
|||
| Quantize | No HwIntrinsics | 73.34 ns | 1.081 ns | 1.011 ns | 1.00 | |
|||
| Quantize | SSE | 24.11 ns | 0.298 ns | 0.279 ns | 0.33 | |
|||
| Quantize | AVX | 15.90 ns | 0.074 ns | 0.065 ns | 0.22 | |
|||
*/ |
|||
Loading…
Reference in new issue