Browse Source

Migrate from Sse to general Vector128 for ZigZag

pull/2918/head
James Jackson-South 1 year ago
parent
commit
30bdc29e40
  1. 13
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs
  2. 2
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs
  3. 12
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
  4. 135
      src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs

13
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs

@ -3,7 +3,6 @@
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
@ -45,22 +44,20 @@ internal partial struct Block8x8F
private static Vector128<float> NormalizeAndRoundVector128(Vector128<float> value, Vector128<float> off, Vector128<float> max)
{
    // Apply the offset first, then clamp the shifted value to [0, max].
    Vector128<float> shifted = value + off;
    Vector128<float> clamped = Vector128_.Clamp(shifted, Vector128<float>.Zero, max);

    // Round the clamped value using the shared Vector128 helper.
    return Vector128_.RoundToNearestInteger(clamped);
}
// Multiplies the float vectors of `a` and `b` element-wise, converts the products to
// 32-bit integers and packs them with signed saturation into the 16-bit vectors of `dest`.
// NOTE(review): this is a diff view without +/- markers; each Sse/Sse2 line below appears
// to be the removed form of the Vector128 line that directly follows it.
private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
private static void MultiplyIntoInt16Vector128(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");
DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!");
// Reinterpret the blocks as sequences of 128-bit vectors so they can be indexed with Unsafe.Add.
ref Vector128<float> aBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
ref Vector128<float> bBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);
ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);
// TODO: We can use the v128 utilities for this.
// Each iteration consumes two float vectors (i and i + 1) from each input and
// produces one short vector at index i / 2, so 16 inputs yield 8 outputs.
for (nuint i = 0; i < 16; i += 2)
{
Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
// Presumably rounds half-way values to even, going by the helper name - confirm in Vector128_.
Vector128<int> left = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0));
Vector128<int> right = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1));
// Narrow both int vectors into a single short vector using signed saturation.
Unsafe.Add(ref destBase, i / 2) = Sse2.PackSignedSaturate(left, right);
Unsafe.Add(ref destBase, i / 2) = Vector128_.PackSignedSaturate(left, right);
}
}
}

2
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs

@ -121,7 +121,7 @@ internal partial struct Block8x8F
}
}
private void TransposeInplace_Avx()
private void TransposeInPlace_Avx()
{
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
Vector256<float> r0 = Avx.InsertVector128(

12
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

@ -282,10 +282,10 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest);
}
else if (Ssse3.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
ZigZag.ApplyTransposingZigZagOrderingSsse3(ref dest);
MultiplyIntoInt16Vector128(ref block, ref qt, ref dest);
ZigZag.ApplyTransposingZigZagOrderingVector128(ref dest);
}
else
{
@ -387,7 +387,7 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)]
public void LoadFrom(ref Block8x8 source)
{
if (SimdUtils.HasVector8)
if (Avx2.IsSupported)
{
this.LoadFromInt16ExtendedAvx2(ref source);
return;
@ -483,6 +483,7 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
/// <param name="value">Value to compare to.</param>
public bool EqualsToScalar(int value)
{
// TODO: Can we provide a Vector128 implementation for this?
if (Avx2.IsSupported)
{
const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
@ -585,10 +586,11 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
{
if (Avx.IsSupported)
{
this.TransposeInplace_Avx();
this.TransposeInPlace_Avx();
}
else
{
// TODO: Can we provide a Vector128 implementation for this?
this.TransposeInPlace_Scalar();
}
}

135
src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs

@ -1,6 +1,9 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@ -17,11 +20,11 @@ internal static partial class ZigZag
#pragma warning restore SA1309
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingSsse3"/>
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingVector128"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
{
private static ReadOnlySpan<byte> SseShuffleMasks =>
[
#pragma warning disable SA1515
/* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */
// A
@ -83,14 +86,14 @@ internal static partial class ZigZag
// H
_, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, 14, 15,
#pragma warning restore SA1515
};
];
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingAvx2"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
{
private static ReadOnlySpan<byte> AvxShuffleMasks =>
[
#pragma warning disable SA1515
/* 01 */
// [cr] crln_01_AB_CD
@ -138,15 +141,15 @@ internal static partial class ZigZag
// (in) GH
_, _, _, _, _, _, _, _, 0, 1, 10, 11, 12, 13, 2, 3, _, _, _, _, _, _, 0, 1, 6, 7, 8, 9, 2, 3, 10, 11,
#pragma warning restore SA1515
};
];
/// <summary>
/// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
/// Applies zig zag ordering for given 8x8 matrix using <see cref="Vector128{T}"/> cpu intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block)
public static unsafe void ApplyTransposingZigZagOrderingVector128(ref Block8x8 block)
{
DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!");
fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks))
{
@ -160,68 +163,68 @@ internal static partial class ZigZag
Vector128<byte> rowH = block.V7.AsByte();
// row0 - A0 B0 A1 A2 B1 C0 D0 C1
Vector128<short> row0_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0))).AsInt16();
Vector128<short> row0_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))).AsInt16();
Vector128<short> row0_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))).AsInt16();
Vector128<short> row0 = Sse2.Or(Sse2.Or(row0_A, row0_B), row0_C);
row0 = Sse2.Insert(row0.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 0), 6).AsInt16();
Vector128<short> row0_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 0))).AsInt16();
Vector128<short> row0_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 1))).AsInt16();
Vector128<short> row0_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 2))).AsInt16();
Vector128<short> row0 = row0_A | row0_B | row0_C;
row0 = row0.AsUInt16().WithElement(6, rowD.AsUInt16().GetElement(0)).AsInt16();
// row1 - B2 A3 A4 B3 C2 D1 E0 F0
Vector128<short> row1_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3))).AsInt16();
Vector128<short> row1_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))).AsInt16();
Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 2), 4).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 1), 5).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 6).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 0), 7).AsInt16();
Vector128<short> row1_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 3))).AsInt16();
Vector128<short> row1_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 4))).AsInt16();
Vector128<short> row1 = row1_A | row1_B;
row1 = row1.AsUInt16().WithElement(4, rowC.AsUInt16().GetElement(2)).AsInt16();
row1 = row1.AsUInt16().WithElement(5, rowD.AsUInt16().GetElement(1)).AsInt16();
row1 = row1.AsUInt16().WithElement(6, rowE.AsUInt16().GetElement(0)).AsInt16();
row1 = row1.AsUInt16().WithElement(7, rowF.AsUInt16().GetElement(0)).AsInt16();
// row2 - E1 D2 C3 B4 A5 A6 B5 C4
Vector128<short> row2_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5))).AsInt16();
Vector128<short> row2_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))).AsInt16();
Vector128<short> row2_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))).AsInt16();
Vector128<short> row2 = Sse2.Or(Sse2.Or(row2_A, row2_B), row2_C);
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 1).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 1), 0).AsInt16();
Vector128<short> row2_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 5))).AsInt16();
Vector128<short> row2_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 6))).AsInt16();
Vector128<short> row2_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 7))).AsInt16();
Vector128<short> row2 = row2_A | row2_B | row2_C;
row2 = row2.AsUInt16().WithElement(1, rowD.AsUInt16().GetElement(2)).AsInt16();
row2 = row2.AsUInt16().WithElement(0, rowE.AsUInt16().GetElement(1)).AsInt16();
// row3 - D3 E2 F1 G0 H0 G1 F2 E3
Vector128<short> row3_E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8))).AsInt16();
Vector128<short> row3_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))).AsInt16();
Vector128<short> row3_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))).AsInt16();
Vector128<short> row3 = Sse2.Or(Sse2.Or(row3_E, row3_F), row3_G);
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 3), 0).AsInt16();
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowH.AsUInt16(), 0), 4).AsInt16();
Vector128<short> row3_E = ZShuffle(rowE, Vector128.Load(shuffleVectorsPtr + (16 * 8))).AsInt16();
Vector128<short> row3_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 9))).AsInt16();
Vector128<short> row3_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 10))).AsInt16();
Vector128<short> row3 = row3_E | row3_F | row3_G;
row3 = row3.AsUInt16().WithElement(0, rowD.AsUInt16().GetElement(3)).AsInt16();
row3 = row3.AsUInt16().WithElement(4, rowH.AsUInt16().GetElement(0)).AsInt16();
// row4 - D4 C5 B6 A7 B7 C6 D5 E4
Vector128<short> row4_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11))).AsInt16();
Vector128<short> row4_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))).AsInt16();
Vector128<short> row4_D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))).AsInt16();
Vector128<short> row4 = Sse2.Or(Sse2.Or(row4_B, row4_C), row4_D);
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowA.AsUInt16(), 7), 3).AsInt16();
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 4), 7).AsInt16();
Vector128<short> row4_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 11))).AsInt16();
Vector128<short> row4_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 12))).AsInt16();
Vector128<short> row4_D = ZShuffle(rowD, Vector128.Load(shuffleVectorsPtr + (16 * 13))).AsInt16();
Vector128<short> row4 = row4_B | row4_C | row4_D;
row4 = row4.AsUInt16().WithElement(3, rowA.AsUInt16().GetElement(7)).AsInt16();
row4 = row4.AsUInt16().WithElement(7, rowE.AsUInt16().GetElement(4)).AsInt16();
// row5 - F3 G2 H1 H2 G3 F4 E5 D6
Vector128<short> row5_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14))).AsInt16();
Vector128<short> row5_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))).AsInt16();
Vector128<short> row5_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))).AsInt16();
Vector128<short> row5 = Sse2.Or(Sse2.Or(row5_F, row5_G), row5_H);
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 6), 7).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 6).AsInt16();
Vector128<short> row5_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 14))).AsInt16();
Vector128<short> row5_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 15))).AsInt16();
Vector128<short> row5_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 16))).AsInt16();
Vector128<short> row5 = row5_F | row5_G | row5_H;
row5 = row5.AsUInt16().WithElement(7, rowD.AsUInt16().GetElement(6)).AsInt16();
row5 = row5.AsUInt16().WithElement(6, rowE.AsUInt16().GetElement(5)).AsInt16();
// row6 - C7 D7 E6 F5 G4 H3 H4 G5
Vector128<short> row6_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17))).AsInt16();
Vector128<short> row6_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))).AsInt16();
Vector128<short> row6 = Sse2.Or(row6_G, row6_H);
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 7), 0).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 1).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 6), 2).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 5), 3).AsInt16();
Vector128<short> row6_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 17))).AsInt16();
Vector128<short> row6_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 18))).AsInt16();
Vector128<short> row6 = row6_G | row6_H;
row6 = row6.AsUInt16().WithElement(0, rowC.AsUInt16().GetElement(7)).AsInt16();
row6 = row6.AsUInt16().WithElement(1, rowD.AsUInt16().GetElement(7)).AsInt16();
row6 = row6.AsUInt16().WithElement(2, rowE.AsUInt16().GetElement(6)).AsInt16();
row6 = row6.AsUInt16().WithElement(3, rowF.AsUInt16().GetElement(5)).AsInt16();
// row7 - F6 E7 F7 G6 H5 H6 G7 H7
Vector128<short> row7_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19))).AsInt16();
Vector128<short> row7_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))).AsInt16();
Vector128<short> row7_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))).AsInt16();
Vector128<short> row7 = Sse2.Or(Sse2.Or(row7_F, row7_G), row7_H);
row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 7), 1).AsInt16();
Vector128<short> row7_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 19))).AsInt16();
Vector128<short> row7_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 20))).AsInt16();
Vector128<short> row7_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 21))).AsInt16();
Vector128<short> row7 = row7_F | row7_G | row7_H;
row7 = row7.AsUInt16().WithElement(1, rowE.AsUInt16().GetElement(7)).AsInt16();
block.V0 = row0;
block.V1 = row1;
@ -300,4 +303,20 @@ internal static partial class ZigZag
block.V67 = row67.AsInt16();
}
}
/// <summary>
/// Shuffles the bytes of <paramref name="source"/> using the byte indices in <paramref name="mask"/>.
/// </summary>
/// <param name="source">The vector whose bytes are selected.</param>
/// <param name="mask">The per-byte selection indices.</param>
/// <returns>The shuffled vector.</returns>
/// <remarks>
/// When SSSE3 is available its shuffle intrinsic is used directly to avoid additional
/// instructions (3 vs 1); every other platform uses the cross-platform
/// <see cref="Vector128.Shuffle(Vector128{byte}, Vector128{byte})"/>, for which codegen
/// is expected to be optimal on ARM and WASM.
/// NOTE(review): the two paths are presumably only interchangeable for mask bytes that are
/// either valid indices (0..15) or have the high bit set - confirm the shuffle tables
/// passed to this method satisfy that.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> ZShuffle(Vector128<byte> source, Vector128<byte> mask)
    => Ssse3.IsSupported
        ? Ssse3.Shuffle(source, mask)
        : Vector128.Shuffle(source, mask);
/// <summary>
/// Throws an <see cref="UnreachableException"/>; kept as a separate non-returning helper.
/// </summary>
/// <exception cref="UnreachableException">Always thrown.</exception>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

Loading…
Cancel
Save