@@ -3,6 +3,7 @@
#if SUPPORTS_RUNTIME_INTRINSICS
using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@@ -18,120 +19,138 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
#pragma warning restore SA1309
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSsse3"/>
/// Gets shuffle vectors for <see cref="ApplyTransposing ZigZagOrderingSsse3"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
{
// row0
0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
_, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
_, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
// row1
_, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
_, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
// row2
_, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
_, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
// row3
_, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
_, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
_, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
// row4
_, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
_, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
_, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
// row5
_, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
// row6
_, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
_, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
// row7
10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
_, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
#pragma warning disable SA1515
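// Each 16-byte group below is one shuffle vector for Ssse3.Shuffle (pshufb)
// over a single source row; the values are byte offsets of the 16-bit
// coefficients within that row. `_` is assumed to be the zero-fill constant
// declared above this hunk (a value with the high bit set, e.g. 0xFF), which
// makes pshufb write zero so the per-row partial results can simply be OR-ed.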
/* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */
// A
0, 1, _, _, 2, 3, 4, 5, _, _, _, _, _, _, _, _,
// B
_, _, 0, 1, _, _, _, _, 2, 3, _, _, _, _, _, _,
// C
_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3,
/* row1 - B2 A3 A4 B3 C2 D1 E0 F0 */
// A
_, _, 6, 7, 8, 9, _, _, _, _, _, _, _, _, _, _,
// B
4, 5, _, _, _, _, 6, 7, _, _, _, _, _, _, _, _,
/* row2 - E1 D2 C3 B4 A5 A6 B5 C4 */
// A
_, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, _, _,
// B
_, _, _, _, _, _, 8, 9, _, _, _, _, 10, 11, _, _,
// C
_, _, _, _, 6, 7, _, _, _, _, _, _, _, _, 8, 9,
/* row3 - D3 E2 F1 G0 H0 G1 F2 E3 */
// E
_, _, 4, 5, _, _, _, _, _, _, _, _, _, _, 6, 7,
// F
_, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, _, _,
// G
_, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _,
/* row4 - D4 C5 B6 A7 B7 C6 D5 E4 */
// B
_, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _,
// C
_, _, 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _,
// D
8, 9, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _,
/* row5 - F3 G2 H1 H2 G3 F4 E5 D6 */
// F
6, 7, _, _, _, _, _, _, _, _, 8, 9, _, _, _, _,
// G
_, _, 4, 5, _, _, _, _, 6, 7, _, _, _, _, _, _,
// H
_, _, _, _, 2, 3, 4, 5, _, _, _, _, _, _, _, _,
/* row6 - C7 D7 E6 F5 G4 H3 H4 G5 */
// G
_, _, _, _, _, _, _, _, 8, 9, _, _, _, _, 10, 11,
// H
_, _, _, _, _, _, _, _, _, _, 6, 7, 8, 9, _, _,
/* row7 - F6 E7 F7 G6 H5 H6 G7 H7 */
// F
12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, _, _,
// G
_, _, _, _, _, _, 12, 13, _, _, _, _, 14, 15, _, _,
// H
_, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, 14, 15,
#pragma warning restore SA1515
};
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyZigZagOrderingAvx2"/>
/// Gets shuffle vectors for <see cref="ApplyTransposing ZigZagOrderingAvx2"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
{
// 01_AB/01_EF/23_CD - cross-lane
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0,
// 01_AB - inner-lane
0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7,
// 01_CD/23_GH - cross-lane
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _,
// 01_CD - inner-lane
_, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _,
// 01_EF - inner-lane
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
// 23_AB/45_CD/67_EF - cross-lane
3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,
// 23_AB - inner-lane
4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _,
// 23_CD - inner-lane
_, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13,
// 23_EF - inner-lane
_, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 23_GH - inner-lane
_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 45_AB - inner-lane
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _,
// 45_CD - inner-lane
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _,
// 45_EF - cross-lane
1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
// 45_EF - inner-lane
2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _,
// 45_GH - inner-lane
_, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
// 67_CD - inner-lane
_, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// 67_EF - inner-lane
_, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _,
// 67_GH - inner-lane
8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15
#pragma warning disable SA1515
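// Two-step shuffling: masks tagged [cr] hold little-endian 32-bit dword
// indices for the cross-lane Avx2.PermuteVar8x32 (vpermd) step, so `1, 0, 0, 0`
// selects dword 1; masks tagged (in) hold byte indices for the per-128-bit-lane
// Avx2.Shuffle (vpshufb) step, with `_` zeroing the byte. The crln_* names list
// the output row pairs that reuse the same cross-lane mask.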
/* 01 */
// [cr] crln_01_AB_CD
0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
// (in) AB
0, 1, 8, 9, 2, 3, 4, 5, 10, 11, _, _, _, _, _, _, 12, 13, 2, 3, 4, 5, 14, 15, _, _, _, _, _, _, _, _,
// (in) CD
_, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9, 2, 3, _, _, _, _, _, _, _, _, 0, 1, 10, 11, _, _, _, _,
// [cr] crln_01_23_EF_23_CD
0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
// (in) EF
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9,
/* 23 */
// [cr] crln_23_AB_23_45_GH
2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0,
// (in) AB
_, _, _, _, _, _, 8, 9, 2, 3, 4, 5, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// (in) CD
_, _, 12, 13, 6, 7, _, _, _, _, _, _, _, _, 8, 9, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// (in) EF
2, 3, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 4, 5, 10, 11, _, _, _, _, _, _, 12, 13, 6, 7,
// (in) GH
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 8, 9, 2, 3, _, _, _, _,
/* 45 */
// (in) AB
_, _, _, _, 12, 13, 6, 7, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// [cr] crln_45_67_CD_45_EF
2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
// (in) CD
8, 9, 2, 3, _, _, _, _, _, _, 4, 5, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 12, 13,
// (in) EF
_, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, 6, 7, _, _, _, _, _, _, _, _, 8, 9, 2, 3, _, _,
// (in) GH
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 4, 5, 10, 11, 12, 13, 6, 7, _, _, _, _, _, _,
/* 67 */
// (in) CD
6, 7, 14, 15, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
// [cr] crln_67_EF_67_GH
2, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,
// (in) EF
_, _, _, _, 4, 5, 14, 15, _, _, _, _, _, _, _, _, 8, 9, 2, 3, 10, 11, _, _, _, _, _, _, _, _, _, _,
// (in) GH
_, _, _, _, _, _, _, _, 0, 1, 10, 11, 12, 13, 2, 3, _, _, _, _, _, _, 0, 1, 6, 7, 8, 9, 2, 3, 10, 11,
#pragma warning restore SA1515
};
/// <summary>
/// Applies zig zag ordering for a given 8x8 matrix using SSE CPU intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block)
{
DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
fixed (byte* maskPtr = SseShuffleMasks)
fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks))
{
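// Most coefficients of each output row are gathered by per-row pshufb
// shuffles OR-ed together; the few that would otherwise need an extra
// shuffle vector are patched in via Sse2.Extract/Sse2.Insert (pextrw/pinsrw).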
Vector128<byte> rowA = block.V0.AsByte();
Vector128<byte> rowB = block.V1.AsByte();
@@ -142,73 +161,69 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
Vector128<byte> rowG = block.V6.AsByte();
Vector128<byte> rowH = block.V7.AsByte();
// row0 - A0 A1 B0 C0 B1 A2 A3 B2
Vector128<short> rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16();
Vector128<short> rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16();
Vector128<short> row0 = Sse2.Or(rowA0, rowB0);
Vector128<short> rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16();
row0 = Sse2.Or(row0, rowC0);
// row1 - C1 D0 E0 D1 C2 B3 A4 A5
Vector128<short> rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16();
Vector128<short> rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16();
Vector128<short> row1 = Sse2.Or(rowA1, rowC1);
Vector128<short> rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16();
row1 = Sse2.Or(row1, rowD1);
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16();
// row2
Vector128<short> rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16();
Vector128<short> rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16();
Vector128<short> row2 = Sse2.Or(rowE2, rowF2);
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16();
// row3
Vector128<short> rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16();
Vector128<short> rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16();
Vector128<short> row3 = Sse2.Or(rowA3, rowB3);
Vector128<short> rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16();
row3 = Sse2.Or(row3, rowC3);
Vector128<byte> shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11));
Vector128<short> rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16();
row3 = Sse2.Or(row3, rowD3);
// row4
Vector128<short> rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16();
Vector128<short> rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16();
Vector128<short> row4 = Sse2.Or(rowE4, rowF4);
Vector128<short> rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16();
row4 = Sse2.Or(row4, rowG4);
Vector128<short> rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16();
row4 = Sse2.Or(row4, rowH4);
// row5
Vector128<short> rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16();
Vector128<short> rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
Vector128<short> row5 = Sse2.Or(rowC5, rowD5);
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16();
// row6
Vector128<short> rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16();
Vector128<short> rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16();
Vector128<short> row6 = Sse2.Or(rowE6, rowF6);
Vector128<short> rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16();
row6 = Sse2.Or(row6, rowH6);
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16();
// row7
Vector128<short> rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16();
Vector128<short> rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16();
Vector128<short> row7 = Sse2.Or(rowG7, rowH7);
row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16();
// row0 - A0 B0 A1 A2 B1 C0 D0 C1
Vector128<short> row0_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0))).AsInt16();
Vector128<short> row0_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))).AsInt16();
Vector128<short> row0_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))).AsInt16();
Vector128<short> row0 = Sse2.Or(Sse2.Or(row0_A, row0_B), row0_C);
row0 = Sse2.Insert(row0.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 0), 6).AsInt16();
// row1 - B2 A3 A4 B3 C2 D1 E0 F0
Vector128<short> row1_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3))).AsInt16();
Vector128<short> row1_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))).AsInt16();
Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 2), 4).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 1), 5).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 6).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 0), 7).AsInt16();
// row2 - E1 D2 C3 B4 A5 A6 B5 C4
Vector128<short> row2_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5))).AsInt16();
Vector128<short> row2_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))).AsInt16();
Vector128<short> row2_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))).AsInt16();
Vector128<short> row2 = Sse2.Or(Sse2.Or(row2_A, row2_B), row2_C);
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 1).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 1), 0).AsInt16();
// row3 - D3 E2 F1 G0 H0 G1 F2 E3
Vector128<short> row3_E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8))).AsInt16();
Vector128<short> row3_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))).AsInt16();
Vector128<short> row3_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))).AsInt16();
Vector128<short> row3 = Sse2.Or(Sse2.Or(row3_E, row3_F), row3_G);
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 3), 0).AsInt16();
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowH.AsUInt16(), 0), 4).AsInt16();
// row4 - D4 C5 B6 A7 B7 C6 D5 E4
Vector128<short> row4_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11))).AsInt16();
Vector128<short> row4_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))).AsInt16();
Vector128<short> row4_D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))).AsInt16();
Vector128<short> row4 = Sse2.Or(Sse2.Or(row4_B, row4_C), row4_D);
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowA.AsUInt16(), 7), 3).AsInt16();
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 4), 7).AsInt16();
// row5 - F3 G2 H1 H2 G3 F4 E5 D6
Vector128<short> row5_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14))).AsInt16();
Vector128<short> row5_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))).AsInt16();
Vector128<short> row5_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))).AsInt16();
Vector128<short> row5 = Sse2.Or(Sse2.Or(row5_F, row5_G), row5_H);
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 6), 7).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 6).AsInt16();
// row6 - C7 D7 E6 F5 G4 H3 H4 G5
Vector128<short> row6_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17))).AsInt16();
Vector128<short> row6_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))).AsInt16();
Vector128<short> row6 = Sse2.Or(row6_G, row6_H);
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 7), 0).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 1).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 6), 2).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 5), 3).AsInt16();
// row7 - F6 E7 F7 G6 H5 H6 G7 H7
Vector128<short> row7_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19))).AsInt16();
Vector128<short> row7_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))).AsInt16();
Vector128<short> row7_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))).AsInt16();
Vector128<short> row7 = Sse2.Or(Sse2.Or(row7_F, row7_G), row7_H);
row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 7), 1).AsInt16();
block.V0 = row0;
block.V1 = row1;
@@ -225,69 +240,61 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
/// Applies zig zag ordering for a given 8x8 matrix using AVX CPU intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block)
public static unsafe void ApplyTransposingZigZagOrderingAvx2(ref Block8x8 block)
{
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(AvxShuffleMasks))
{
Vector256<byte> rowsAB = block.V01.AsByte();
Vector256<byte> rowsCD = block.V23.AsByte();
Vector256<byte> rowsEF = block.V45.AsByte();
Vector256<byte> rowsGH = block.V67.AsByte();
// rows 0 1
Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256<byte> rowAB = block.V01.AsByte();
Vector256<byte> rowCD = block.V23.AsByte();
Vector256<byte> rowEF = block.V45.AsByte();
Vector256<byte> rowGH = block.V67.AsByte();
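// Each pair of output rows is assembled from up to four source row pairs:
// vpermd first moves the required dwords across the 128-bit lanes, vpshufb
// then places the bytes within each lane, and the partials are OR-ed together.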
/* row01 - A0 B0 A1 A2 B1 C0 D0 C1 | B2 A3 A4 B3 C2 D1 E0 F0 */
Vector256<int> crln_01_AB_CD = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_01_AB_CD).AsByte();
row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
// rows 2 3
Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_AB_CD).AsByte();
row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (2 * 32))).AsByte();
Vector256<int> crln_01_23_EF_23_CD = Avx.LoadVector256(shuffleVectorsPtr + (3 * 32)).AsInt32();
Vector256<byte> row01_23_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_01_23_EF_23_CD).AsByte();
Vector256<byte> row01_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
Vector256<byte> row01 = Avx2.Or(row01_AB, Avx2.Or(row01_CD, row01_EF));
/* row23 - E1 D2 C3 B4 A5 A6 B5 C4 | D3 E2 F1 G0 H0 G1 F2 E3 */
Vector256<int> crln_23_AB_23_45_GH = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
Vector256<byte> row23_45_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_23_AB_23_45_GH).AsByte();
Vector256<byte> row23_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_23_EF_23_CD).AsByte();
row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
Vector256<byte> row23_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
Vector256<byte> row23_45_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_23_AB_23_45_GH).AsByte();
Vector256<byte> row23_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32))).AsByte();
Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
// rows 4 5
Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
/* row45 - D4 C5 B6 A7 B7 C6 D5 E4 | F3 G2 H1 H2 G3 F4 E5 D6 */
Vector256<byte> row45_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32))).AsByte();
Vector256<int> crln_45_67_CD_45_EF = Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsInt32();
Vector256<byte> row45_67_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_45_67_CD_45_EF).AsByte();
Vector256<byte> row45_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (12 * 32))).AsByte();
Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_45_67_CD_45_EF).AsByte();
row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32))).AsByte();
Vector256<byte> row45_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32))).AsByte();
Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
// rows 6 7
Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
/* row67 - C7 D7 E6 F5 G4 H3 H4 G5 | F6 E7 F7 G6 H5 H6 G7 H7 */
Vector256<byte> row67_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32))).AsByte();
Vector256<int> crln_67_EF_67_GH = Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsInt32();
Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_67_EF_67_GH).AsByte();
row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32))).AsByte();
Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_67_EF_67_GH).AsByte();
row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (18 * 32))).AsByte();
Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
Vector256<byte> row67 = Avx2.Or(row67_CD, Avx2.Or(row67_EF, row67_GH));
block.V01 = row01.AsInt16();
block.V23 = row23.AsInt16();