mirror of https://github.com/SixLabors/ImageSharp
committed by
GitHub
35 changed files with 1104 additions and 970 deletions
@ -1,153 +0,0 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Six Labors Split License.
|
|||
|
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
|
|||
// <auto-generated />
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
internal partial struct Block8x8F
{
    /// <summary>
    /// Adds ceil(<paramref name="maximum"/> / 2) to every value in the block and
    /// clamps the result to the range [0, <paramref name="maximum"/>].
    /// </summary>
    /// <param name="maximum">The upper bound of the clamped range.</param>
    public void NormalizeColorsInPlace(float maximum)
    {
        Vector4 lower = new Vector4(0F);
        Vector4 upper = new Vector4(maximum);
        Vector4 shift = new Vector4(MathF.Ceiling(maximum * 0.5F));

        // One statement per Vector4 half-row, left half first.
        this.V0L = Numerics.Clamp(this.V0L + shift, lower, upper);
        this.V0R = Numerics.Clamp(this.V0R + shift, lower, upper);
        this.V1L = Numerics.Clamp(this.V1L + shift, lower, upper);
        this.V1R = Numerics.Clamp(this.V1R + shift, lower, upper);
        this.V2L = Numerics.Clamp(this.V2L + shift, lower, upper);
        this.V2R = Numerics.Clamp(this.V2R + shift, lower, upper);
        this.V3L = Numerics.Clamp(this.V3L + shift, lower, upper);
        this.V3R = Numerics.Clamp(this.V3R + shift, lower, upper);
        this.V4L = Numerics.Clamp(this.V4L + shift, lower, upper);
        this.V4R = Numerics.Clamp(this.V4R + shift, lower, upper);
        this.V5L = Numerics.Clamp(this.V5L + shift, lower, upper);
        this.V5R = Numerics.Clamp(this.V5R + shift, lower, upper);
        this.V6L = Numerics.Clamp(this.V6L + shift, lower, upper);
        this.V6R = Numerics.Clamp(this.V6R + shift, lower, upper);
        this.V7L = Numerics.Clamp(this.V7L + shift, lower, upper);
        this.V7R = Numerics.Clamp(this.V7R + shift, lower, upper);
    }

    /// <summary>
    /// AVX2-only variant for executing <see cref="NormalizeColorsInPlace"/> and <see cref="RoundInPlace"/> in one step.
    /// </summary>
    /// <param name="maximum">The upper bound of the clamped range.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void NormalizeColorsAndRoundInPlaceVector8(float maximum)
    {
        Vector<float> shift = new Vector<float>(MathF.Ceiling(maximum * 0.5F));
        Vector<float> upper = new Vector<float>(maximum);

        // Reinterpret the start of each row (its left Vector4) as a Vector<float>.
        // NOTE(review): this covers a full 8-float row only when Vector<float>.Count == 8 - confirm callers gate on that.
        ref Vector<float> r0 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V0L);
        ref Vector<float> r1 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V1L);
        ref Vector<float> r2 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V2L);
        ref Vector<float> r3 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V3L);
        ref Vector<float> r4 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V4L);
        ref Vector<float> r5 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V5L);
        ref Vector<float> r6 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V6L);
        ref Vector<float> r7 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V7L);

        r0 = NormalizeAndRound(r0, shift, upper);
        r1 = NormalizeAndRound(r1, shift, upper);
        r2 = NormalizeAndRound(r2, shift, upper);
        r3 = NormalizeAndRound(r3, shift, upper);
        r4 = NormalizeAndRound(r4, shift, upper);
        r5 = NormalizeAndRound(r5, shift, upper);
        r6 = NormalizeAndRound(r6, shift, upper);
        r7 = NormalizeAndRound(r7, shift, upper);
    }

    /// <summary>
    /// Fill the block from 'source' doing short -> float conversion.
    /// </summary>
    /// <param name="source">The block of 16-bit values to read; element (row * 8 + column) feeds the matching float.</param>
    public void LoadFromInt16Scalar(ref Block8x8 source)
    {
        ref short first = ref Unsafe.As<Block8x8, short>(ref source);

        LoadRow(ref this.V0L, ref this.V0R, ref Unsafe.Add(ref first, 0));
        LoadRow(ref this.V1L, ref this.V1R, ref Unsafe.Add(ref first, 8));
        LoadRow(ref this.V2L, ref this.V2R, ref Unsafe.Add(ref first, 16));
        LoadRow(ref this.V3L, ref this.V3R, ref Unsafe.Add(ref first, 24));
        LoadRow(ref this.V4L, ref this.V4R, ref Unsafe.Add(ref first, 32));
        LoadRow(ref this.V5L, ref this.V5R, ref Unsafe.Add(ref first, 40));
        LoadRow(ref this.V6L, ref this.V6R, ref Unsafe.Add(ref first, 48));
        LoadRow(ref this.V7L, ref this.V7R, ref Unsafe.Add(ref first, 56));

        // Copies eight consecutive shorts into the two Vector4 halves of one row.
        static void LoadRow(ref Vector4 left, ref Vector4 right, ref short values)
        {
            left.X = Unsafe.Add(ref values, 0);
            left.Y = Unsafe.Add(ref values, 1);
            left.Z = Unsafe.Add(ref values, 2);
            left.W = Unsafe.Add(ref values, 3);
            right.X = Unsafe.Add(ref values, 4);
            right.Y = Unsafe.Add(ref values, 5);
            right.Z = Unsafe.Add(ref values, 6);
            right.W = Unsafe.Add(ref values, 7);
        }
    }
}
|||
@ -1,103 +0,0 @@ |
|||
<# |
|||
// Copyright (c) Six Labors. |
|||
// Licensed under the Six Labors Split License. |
|||
#> |
|||
<#@ template debug="false" hostspecific="false" language="C#" #> |
|||
<#@ assembly name="System.Core" #> |
|||
<#@ import namespace="System.Linq" #> |
|||
<#@ import namespace="System.Text" #> |
|||
<#@ import namespace="System.Collections.Generic" #> |
|||
<#@ output extension=".cs" #> |
|||
// Copyright (c) Six Labors. |
|||
// Licensed under the Six Labors Split License. |
|||
|
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
|
|||
// <auto-generated /> |
|||
<# |
|||
char[] coordz = {'X', 'Y', 'Z', 'W'}; |
|||
#> |
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
internal partial struct Block8x8F
{
    /// <summary>
    /// Level shift by +maximum/2, clip to [0, maximum]
    /// </summary>
    public void NormalizeColorsInPlace(float maximum)
    {
        var CMin4 = new Vector4(0F);
        var CMax4 = new Vector4(maximum);
        var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F));

<#

    // Emit one clamp statement per Vector4 half-row: V0L, V0R, V1L, ... V7R.
    PushIndent(" ");

    for (int i = 0; i < 8; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            char side = j == 0 ? 'L' : 'R';
            Write($"this.V{i}{side} = Numerics.Clamp(this.V{i}{side} + COff4, CMin4, CMax4);\r\n");
        }
    }
    PopIndent();
#>
    }

    /// <summary>
    /// AVX2-only variant for executing <see cref="NormalizeColorsInPlace"/> and <see cref="RoundInPlace"/> in one step.
    /// </summary>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void NormalizeColorsAndRoundInPlaceVector8(float maximum)
    {
        var off = new Vector<float>(MathF.Ceiling(maximum * 0.5F));
        var max = new Vector<float>(maximum);
<#

    // Emit one reinterpret + normalize pair per row (row0 .. row7).
    for (int i = 0; i < 8; i++)
    {
#>

        ref Vector<float> row<#=i#> = ref Unsafe.As<Vector4, Vector<float>>(ref this.V<#=i#>L);
        row<#=i#> = NormalizeAndRound(row<#=i#>, off, max);
<#
    }
#>

    }

    /// <summary>
    /// Fill the block from 'source' doing short -> float conversion.
    /// </summary>
    public void LoadFromInt16Scalar(ref Block8x8 source)
    {
        ref short selfRef = ref Unsafe.As<Block8x8, short>(ref source);

<#
    // Emit 64 scalar assignments: j is the row, i the column within the row.
    PushIndent(" ");
    for (int j = 0; j < 8; j++)
    {
        for (int i = 0; i < 8; i++)
        {
            // Columns 0-3 go to the X/Y/Z/W of the 'L' half, columns 4-7 to the 'R' half.
            char destCoord = coordz[i % 4];
            char destSide = (i / 4) % 2 == 0 ? 'L' : 'R';

            // Blank line between rows in the generated code.
            if(j > 0 && i == 0){
                WriteLine("");
            }

            var expression = $"this.V{j}{destSide}.{destCoord} = Unsafe.Add(ref selfRef, {j*8+i});\r\n";
            Write(expression);

        }
    }
    PopIndent();
#>
    }
}
|||
@ -1,144 +0,0 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Six Labors Split License.
|
|||
|
|||
using System.Numerics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
internal partial struct Block8x8F
{
    /// <summary>
    /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
    /// </summary>
    public const int RowCount = 8;

    // Eight Vector256<float> fields at consecutive 32-byte offsets.
    // NOTE(review): presumably these overlay the V{n}L / V{n}R Vector4 pairs declared elsewhere in this type - confirm.
    [FieldOffset(0)]
    public Vector256<float> V0;
    [FieldOffset(32)]
    public Vector256<float> V1;
    [FieldOffset(64)]
    public Vector256<float> V2;
    [FieldOffset(96)]
    public Vector256<float> V3;
    [FieldOffset(128)]
    public Vector256<float> V4;
    [FieldOffset(160)]
    public Vector256<float> V5;
    [FieldOffset(192)]
    public Vector256<float> V6;
    [FieldOffset(224)]
    public Vector256<float> V7;

    /// <summary>
    /// Multiplies <paramref name="a"/> and <paramref name="b"/> element-wise, converts the products to
    /// 32-bit integers, packs them to 16-bit values with signed saturation and stores them in <paramref name="dest"/>.
    /// </summary>
    /// <param name="a">The first factor block.</param>
    /// <param name="b">The second factor block.</param>
    /// <param name="dest">The destination block.</param>
    private static void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
    {
        DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

        ref Vector256<float> aBase = ref a.V0;
        ref Vector256<float> bBase = ref b.V0;

        ref Vector256<short> destRef = ref dest.V01;

        // 32-bit element permutation applied after packing, so that each pair of rows ends up stored in row order.
        Vector256<int> multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);

        // Two source rows (2 x 8 floats) produce one destination vector (16 shorts) per iteration.
        for (nuint i = 0; i < 8; i += 2)
        {
            Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
            Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));

            Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
            row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16();

            Unsafe.Add(ref destRef, i / 2) = row;
        }
    }

    /// <summary>
    /// SSE2 counterpart of <see cref="MultiplyIntoInt16_Avx2"/> working on 128-bit halves of each row.
    /// </summary>
    /// <param name="a">The first factor block.</param>
    /// <param name="b">The second factor block.</param>
    /// <param name="dest">The destination block.</param>
    private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
    {
        DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");

        ref Vector128<float> aBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
        ref Vector128<float> bBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);

        ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);

        // Two 4-float halves in, one 8-short vector out per iteration.
        for (nuint i = 0; i < 16; i += 2)
        {
            Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
            Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));

            Vector128<short> row = Sse2.PackSignedSaturate(left, right);
            Unsafe.Add(ref destBase, i / 2) = row;
        }
    }

    /// <summary>
    /// Transposes the block in place using AVX unpack / shuffle / blend operations.
    /// </summary>
    private void TransposeInplace_Avx()
    {
        DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to run this operation!");

        // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
        // r0..r3: lower lane from rows 0-3, upper lane replaced by the left half of rows 4-7.
        Vector256<float> r0 = Avx.InsertVector128(
            this.V0,
            Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
            1);

        Vector256<float> r1 = Avx.InsertVector128(
            this.V1,
            Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
            1);

        Vector256<float> r2 = Avx.InsertVector128(
            this.V2,
            Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
            1);

        Vector256<float> r3 = Avx.InsertVector128(
            this.V3,
            Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
            1);

        // r4..r7: right half of rows 0-3 in the lower lane, right half of rows 4-7 in the upper lane.
        Vector256<float> r4 = Avx.InsertVector128(
            Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
            Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
            1);

        Vector256<float> r5 = Avx.InsertVector128(
            Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
            Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
            1);

        Vector256<float> r6 = Avx.InsertVector128(
            Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
            Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
            1);

        Vector256<float> r7 = Avx.InsertVector128(
            Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
            Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
            1);

        // All rows have been read into r0..r7 above, so the fields can now be overwritten.
        Vector256<float> t0 = Avx.UnpackLow(r0, r1);
        Vector256<float> t2 = Avx.UnpackLow(r2, r3);
        Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
        this.V0 = Avx.Blend(t0, v, 0xCC);
        this.V1 = Avx.Blend(t2, v, 0x33);

        Vector256<float> t4 = Avx.UnpackLow(r4, r5);
        Vector256<float> t6 = Avx.UnpackLow(r6, r7);
        v = Avx.Shuffle(t4, t6, 0x4E);
        this.V4 = Avx.Blend(t4, v, 0xCC);
        this.V5 = Avx.Blend(t6, v, 0x33);

        Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
        Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
        v = Avx.Shuffle(t1, t3, 0x4E);
        this.V2 = Avx.Blend(t1, v, 0xCC);
        this.V3 = Avx.Blend(t3, v, 0x33);

        Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
        Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
        v = Avx.Shuffle(t5, t7, 0x4E);
        this.V6 = Avx.Blend(t5, v, 0xCC);
        this.V7 = Avx.Blend(t7, v, 0x33);
    }
}
|||
@ -0,0 +1,93 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Six Labors Split License.
|
|||
|
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.Intrinsics; |
|||
using SixLabors.ImageSharp.Common.Helpers; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
/// <content>
/// <see cref="Vector128{Single}"/> version of <see cref="Block8x8F"/>.
/// </content>
internal partial struct Block8x8F
{
    /// <summary>
    /// <see cref="Vector128{Single}"/> variant that combines <see cref="NormalizeColorsInPlace(float)"/>
    /// and <see cref="RoundInPlace()"/> in a single pass.
    /// </summary>
    /// <param name="maximum">The maximum value to normalize to.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void NormalizeColorsAndRoundInPlaceVector128(float maximum)
    {
        Vector128<float> ceiling = Vector128.Create(maximum);
        Vector128<float> shift = Vector128.Ceiling(ceiling * .5F);

        // Walk the block as 16 consecutive 4-float halves (V0L, V0R, ... V7R), using the same
        // reinterpretation as LoadFromInt16ExtendedVector128 below.
        ref Vector128<float> halves = ref Unsafe.As<Block8x8F, Vector128<float>>(ref this);
        for (nuint i = 0; i < 16; i++)
        {
            ref Vector128<float> half = ref Unsafe.Add(ref halves, i);
            half = NormalizeAndRoundVector128(half, shift, ceiling);
        }
    }

    /// <summary>
    /// Loads values from <paramref name="source"/>, widening each 16-bit value to 32 bits and
    /// converting it to <see cref="float"/> using <see cref="Vector128{T}"/> operations.
    /// </summary>
    /// <param name="source">The source <see cref="Block8x8"/></param>
    public void LoadFromInt16ExtendedVector128(ref Block8x8 source)
    {
        DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!");

        ref Vector128<short> rows = ref Unsafe.As<Block8x8, Vector128<short>>(ref source);
        ref Vector128<float> halves = ref Unsafe.As<Block8x8F, Vector128<float>>(ref this);

        // One 128-bit vector of 8 shorts per row; each row yields two vectors of 4 floats.
        for (nuint row = 0; row < 8; row++)
        {
            Vector128<short> packed = Unsafe.Add(ref rows, row);
            nuint left = row * 2;

            // short -> int -> float, lower four values first, then the upper four.
            Unsafe.Add(ref halves, left) = Vector128.ConvertToSingle(Vector128.WidenLower(packed));
            Unsafe.Add(ref halves, left + 1) = Vector128.ConvertToSingle(Vector128.WidenUpper(packed));
        }
    }

    /// <summary>
    /// Adds <paramref name="off"/> to <paramref name="value"/>, clamps the sum to [0, <paramref name="max"/>]
    /// and rounds it to the nearest integer.
    /// </summary>
    [MethodImpl(InliningOptions.ShortMethod)]
    private static Vector128<float> NormalizeAndRoundVector128(Vector128<float> value, Vector128<float> off, Vector128<float> max)
    {
        Vector128<float> clamped = Vector128_.Clamp(value + off, Vector128<float>.Zero, max);
        return Vector128_.RoundToNearestInteger(clamped);
    }

    /// <summary>
    /// Multiplies <paramref name="a"/> and <paramref name="b"/> element-wise, converts the products to
    /// 32-bit integers and packs them with signed saturation into <paramref name="dest"/>.
    /// </summary>
    /// <param name="a">The first factor block.</param>
    /// <param name="b">The second factor block.</param>
    /// <param name="dest">The destination block.</param>
    private static void MultiplyIntoInt16Vector128(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
    {
        DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!");

        ref Vector128<float> lhs = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
        ref Vector128<float> rhs = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);
        ref Vector128<short> packedRows = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);

        // Each destination row is built from the left and right 4-float halves of the matching source row.
        for (nuint row = 0; row < 8; row++)
        {
            nuint leftIndex = row * 2;
            nuint rightIndex = leftIndex + 1;

            Vector128<int> left = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref lhs, leftIndex) * Unsafe.Add(ref rhs, leftIndex));
            Vector128<int> right = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref lhs, rightIndex) * Unsafe.Add(ref rhs, rightIndex));

            Unsafe.Add(ref packedRows, row) = Vector128_.PackSignedSaturate(left, right);
        }
    }
}
|||
@ -0,0 +1,157 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Six Labors Split License.
|
|||
|
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
using SixLabors.ImageSharp.Common.Helpers; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
/// <content>
/// <see cref="Vector256{Single}"/> version of <see cref="Block8x8F"/>.
/// </content>
internal partial struct Block8x8F
{
    /// <summary>
    /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
    /// </summary>
    public const int RowCount = 8;

    // Eight Vector256<float> fields at consecutive 32-byte offsets, one per 8-float row.
#pragma warning disable SA1310 // Field names should not contain underscore
    [FieldOffset(0)]
    public Vector256<float> V256_0;
    [FieldOffset(32)]
    public Vector256<float> V256_1;
    [FieldOffset(64)]
    public Vector256<float> V256_2;
    [FieldOffset(96)]
    public Vector256<float> V256_3;
    [FieldOffset(128)]
    public Vector256<float> V256_4;
    [FieldOffset(160)]
    public Vector256<float> V256_5;
    [FieldOffset(192)]
    public Vector256<float> V256_6;
    [FieldOffset(224)]
    public Vector256<float> V256_7;
#pragma warning restore SA1310 // Field names should not contain underscore

    /// <summary>
    /// <see cref="Vector256{Single}"/> version of <see cref="NormalizeColorsInPlace(float)"/> and <see cref="RoundInPlace()"/>.
    /// </summary>
    /// <param name="maximum">The maximum value to normalize to.</param>
    [MethodImpl(InliningOptions.ShortMethod)]
    public void NormalizeColorsAndRoundInPlaceVector256(float maximum)
    {
        Vector256<float> max = Vector256.Create(maximum);
        Vector256<float> off = Vector256.Ceiling(max * .5F);

        this.V256_0 = NormalizeAndRoundVector256(this.V256_0, off, max);
        this.V256_1 = NormalizeAndRoundVector256(this.V256_1, off, max);
        this.V256_2 = NormalizeAndRoundVector256(this.V256_2, off, max);
        this.V256_3 = NormalizeAndRoundVector256(this.V256_3, off, max);
        this.V256_4 = NormalizeAndRoundVector256(this.V256_4, off, max);
        this.V256_5 = NormalizeAndRoundVector256(this.V256_5, off, max);
        this.V256_6 = NormalizeAndRoundVector256(this.V256_6, off, max);
        this.V256_7 = NormalizeAndRoundVector256(this.V256_7, off, max);
    }

    /// <summary>
    /// Loads values from <paramref name="source"/> using <see cref="Vector256{T}"/> intrinsics.
    /// </summary>
    /// <param name="source">The source <see cref="Block8x8"/></param>
    public void LoadFromInt16ExtendedVector256(ref Block8x8 source)
    {
        DebugGuard.IsTrue(
            Vector256.IsHardwareAccelerated,
            "LoadFromInt16ExtendedVector256 only works on Vector256 compatible architecture!");

        ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
        ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);

        // Each Vector128 load picks up one block row; successive loads are Vector256<int>.Count
        // elements apart. The row is widened and converted to float to fill one destination row.
        // Rows are handled in pairs (top / bottom).
        Vector256<int> top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef));
        Vector256<int> bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
        dRef = Vector256.ConvertToSingle(top);
        Unsafe.Add(ref dRef, 1) = Vector256.ConvertToSingle(bottom);

        top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
        bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
        Unsafe.Add(ref dRef, 2) = Vector256.ConvertToSingle(top);
        Unsafe.Add(ref dRef, 3) = Vector256.ConvertToSingle(bottom);

        top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
        bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
        Unsafe.Add(ref dRef, 4) = Vector256.ConvertToSingle(top);
        Unsafe.Add(ref dRef, 5) = Vector256.ConvertToSingle(bottom);

        top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
        bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
        Unsafe.Add(ref dRef, 6) = Vector256.ConvertToSingle(top);
        Unsafe.Add(ref dRef, 7) = Vector256.ConvertToSingle(bottom);
    }

    /// <summary>
    /// Adds <paramref name="off"/> to <paramref name="value"/>, clamps the sum to [0, <paramref name="max"/>]
    /// and rounds it to the nearest integer.
    /// </summary>
    [MethodImpl(InliningOptions.ShortMethod)]
    private static Vector256<float> NormalizeAndRoundVector256(Vector256<float> value, Vector256<float> off, Vector256<float> max)
        => Vector256_.RoundToNearestInteger(Vector256_.Clamp(value + off, Vector256<float>.Zero, max));

    /// <summary>
    /// Multiplies <paramref name="a"/> and <paramref name="b"/> element-wise, converts the products to
    /// 32-bit integers, packs them with signed saturation and stores them in <paramref name="dest"/>.
    /// </summary>
    /// <param name="a">The first factor block.</param>
    /// <param name="b">The second factor block.</param>
    /// <param name="dest">The destination block.</param>
    private static void MultiplyIntoInt16Vector256(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
    {
        DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to run this operation!");

        ref Vector256<float> aBase = ref a.V256_0;
        ref Vector256<float> bBase = ref b.V256_0;
        ref Vector256<short> destRef = ref dest.V01;

        // Two source rows (2 x 8 floats) produce one destination vector (16 shorts) per iteration.
        for (nuint i = 0; i < 8; i += 2)
        {
            Vector256<int> row0 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0));
            Vector256<int> row1 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1));

            Vector256<short> row = Vector256_.PackSignedSaturate(row0, row1);

            // Reorder the packed 32-bit elements so the two rows are stored in row order.
            // NOTE(review): the index vector is kept inline at the call site on purpose - presumably so the JIT
            // sees a constant shuffle mask; confirm before hoisting it out of the loop.
            row = Vector256.Shuffle(row.AsInt32(), Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7)).AsInt16();

            Unsafe.Add(ref destRef, i / 2) = row;
        }
    }

    /// <summary>
    /// Transposes the block in place. Requires <see cref="Avx"/> support: the unpack, shuffle and
    /// blend steps use x86 AVX intrinsics directly.
    /// </summary>
    private void TransposeInPlaceVector256()
    {
        DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to run this operation!");

        // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
        // r0..r3: lower lane from rows 0-3, upper lane replaced by the left half of rows 4-7.
        Vector256<float> r0 = this.V256_0.WithUpper(this.V4L.AsVector128());
        Vector256<float> r1 = this.V256_1.WithUpper(this.V5L.AsVector128());
        Vector256<float> r2 = this.V256_2.WithUpper(this.V6L.AsVector128());
        Vector256<float> r3 = this.V256_3.WithUpper(this.V7L.AsVector128());

        // r4..r7: right half of rows 0-3 in the lower lane, right half of rows 4-7 in the upper lane.
        Vector256<float> r4 = this.V0R.AsVector128().ToVector256().WithUpper(this.V4R.AsVector128());
        Vector256<float> r5 = this.V1R.AsVector128().ToVector256().WithUpper(this.V5R.AsVector128());
        Vector256<float> r6 = this.V2R.AsVector128().ToVector256().WithUpper(this.V6R.AsVector128());
        Vector256<float> r7 = this.V3R.AsVector128().ToVector256().WithUpper(this.V7R.AsVector128());

        // All rows have been read into r0..r7 above, so the fields can now be overwritten.
        Vector256<float> t0 = Avx.UnpackLow(r0, r1);
        Vector256<float> t2 = Avx.UnpackLow(r2, r3);
        Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
        this.V256_0 = Avx.Blend(t0, v, 0xCC);
        this.V256_1 = Avx.Blend(t2, v, 0x33);

        Vector256<float> t4 = Avx.UnpackLow(r4, r5);
        Vector256<float> t6 = Avx.UnpackLow(r6, r7);
        v = Avx.Shuffle(t4, t6, 0x4E);
        this.V256_4 = Avx.Blend(t4, v, 0xCC);
        this.V256_5 = Avx.Blend(t6, v, 0x33);

        Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
        Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
        v = Avx.Shuffle(t1, t3, 0x4E);
        this.V256_2 = Avx.Blend(t1, v, 0xCC);
        this.V256_3 = Avx.Blend(t3, v, 0x33);

        Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
        Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
        v = Avx.Shuffle(t5, t7, 0x4E);
        this.V256_6 = Avx.Blend(t5, v, 0xCC);
        this.V256_7 = Avx.Blend(t7, v, 0x33);
    }
}
|||
@ -1,142 +0,0 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Six Labors Split License.
|
|||
|
|||
using System.Runtime.Intrinsics; |
|||
using System.Runtime.Intrinsics.X86; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
internal static partial class FloatingPointDCT
{
    /// <summary>
    /// Runs the floating point forward DCT on <paramref name="block"/> in place using AVX operations:
    /// a 1D pass, a transpose, then a second 1D pass.
    /// </summary>
    /// <param name="block">Input block.</param>
    private static void FDCT8x8_Avx(ref Block8x8F block)
    {
        DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");

        // Columns first.
        FDCT8x8_1D_Avx(ref block);

        // Then rows, by transposing and repeating the same 1D pass.
        block.TransposeInplace();
        FDCT8x8_1D_Avx(ref block);

        // Applies the 1D floating point FDCT in place.
        static void FDCT8x8_1D_Avx(ref Block8x8F block)
        {
            Vector256<float> c0_7071 = Vector256.Create(0.707106781f);
            Vector256<float> c0_3826 = Vector256.Create(0.382683433f);
            Vector256<float> c0_5411 = Vector256.Create(0.541196100f);
            Vector256<float> c1_3065 = Vector256.Create(1.306562965f);

            // Butterflies of mirrored rows: sums feed the even part, differences the odd part.
            Vector256<float> sum07 = Avx.Add(block.V0, block.V7);
            Vector256<float> diff07 = Avx.Subtract(block.V0, block.V7);
            Vector256<float> sum16 = Avx.Add(block.V1, block.V6);
            Vector256<float> diff16 = Avx.Subtract(block.V1, block.V6);
            Vector256<float> sum25 = Avx.Add(block.V2, block.V5);
            Vector256<float> diff25 = Avx.Subtract(block.V2, block.V5);
            Vector256<float> sum34 = Avx.Add(block.V3, block.V4);
            Vector256<float> diff34 = Avx.Subtract(block.V3, block.V4);

            // Even part
            Vector256<float> even0 = Avx.Add(sum07, sum34);
            Vector256<float> even3 = Avx.Subtract(sum07, sum34);
            Vector256<float> even1 = Avx.Add(sum16, sum25);
            Vector256<float> even2 = Avx.Subtract(sum16, sum25);

            block.V0 = Avx.Add(even0, even1);
            block.V4 = Avx.Subtract(even0, even1);

            Vector256<float> z1 = Avx.Multiply(Avx.Add(even2, even3), c0_7071);
            block.V2 = Avx.Add(even3, z1);
            block.V6 = Avx.Subtract(even3, z1);

            // Odd part
            Vector256<float> odd0 = Avx.Add(diff34, diff25);
            Vector256<float> odd1 = Avx.Add(diff25, diff16);
            Vector256<float> odd2 = Avx.Add(diff16, diff07);

            Vector256<float> z5 = Avx.Multiply(Avx.Subtract(odd0, odd2), c0_3826);
            Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, c0_5411, odd0);
            Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, c1_3065, odd2);
            Vector256<float> z3 = Avx.Multiply(odd1, c0_7071);

            Vector256<float> z11 = Avx.Add(diff07, z3);
            Vector256<float> z13 = Avx.Subtract(diff07, z3);

            block.V5 = Avx.Add(z13, z2);
            block.V3 = Avx.Subtract(z13, z2);
            block.V1 = Avx.Add(z11, z4);
            block.V7 = Avx.Subtract(z11, z4);
        }
    }

    /// <summary>
    /// Runs the floating point inverse DCT on <paramref name="transposedBlock"/> in place using AVX operations:
    /// a 1D pass, a transpose, then a second 1D pass.
    /// </summary>
    /// <param name="transposedBlock">Transposed input block.</param>
    private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
    {
        DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");

        // Columns first.
        IDCT8x8_1D_Avx(ref transposedBlock);

        // Then rows, by transposing and repeating the same 1D pass.
        transposedBlock.TransposeInplace();
        IDCT8x8_1D_Avx(ref transposedBlock);

        // Applies the 1D floating point IDCT in place.
        static void IDCT8x8_1D_Avx(ref Block8x8F block)
        {
            Vector256<float> c1_4142 = Vector256.Create(1.414213562f);
            Vector256<float> c1_8477 = Vector256.Create(1.847759065f);
            Vector256<float> cN1_0823 = Vector256.Create(-1.082392200f);
            Vector256<float> cN2_6131 = Vector256.Create(-2.613125930f);

            // Even part
            Vector256<float> in0 = block.V0;
            Vector256<float> in2 = block.V2;
            Vector256<float> in4 = block.V4;
            Vector256<float> in6 = block.V6;

            Vector256<float> sum04 = Avx.Add(in0, in4);
            Vector256<float> diff04 = Avx.Subtract(in0, in4);

            Vector256<float> sum26 = Avx.Add(in2, in6);
            Vector256<float> rot26 = SimdUtils.HwIntrinsics.MultiplySubtract(sum26, Avx.Subtract(in2, in6), c1_4142);

            Vector256<float> even0 = Avx.Add(sum04, sum26);
            Vector256<float> even3 = Avx.Subtract(sum04, sum26);
            Vector256<float> even1 = Avx.Add(diff04, rot26);
            Vector256<float> even2 = Avx.Subtract(diff04, rot26);

            // Odd part
            Vector256<float> in1 = block.V1;
            Vector256<float> in3 = block.V3;
            Vector256<float> in5 = block.V5;
            Vector256<float> in7 = block.V7;

            Vector256<float> z13 = Avx.Add(in5, in3);
            Vector256<float> z10 = Avx.Subtract(in5, in3);
            Vector256<float> z11 = Avx.Add(in1, in7);
            Vector256<float> z12 = Avx.Subtract(in1, in7);

            Vector256<float> odd7 = Avx.Add(z11, z13);
            Vector256<float> scaledDiff = Avx.Multiply(Avx.Subtract(z11, z13), c1_4142);

            Vector256<float> z5 = Avx.Multiply(Avx.Add(z10, z12), c1_8477);

            Vector256<float> mixed12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, cN1_0823);
            Vector256<float> mixed10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, cN2_6131);

            // Each odd term depends on the one computed just before it.
            Vector256<float> odd6 = Avx.Subtract(mixed10, odd7);
            Vector256<float> odd5 = Avx.Subtract(scaledDiff, odd6);
            Vector256<float> odd4 = Avx.Subtract(mixed12, odd5);

            block.V0 = Avx.Add(even0, odd7);
            block.V7 = Avx.Subtract(even0, odd7);
            block.V1 = Avx.Add(even1, odd6);
            block.V6 = Avx.Subtract(even1, odd6);
            block.V2 = Avx.Add(even2, odd5);
            block.V5 = Avx.Subtract(even2, odd5);
            block.V3 = Avx.Add(even3, odd4);
            block.V4 = Avx.Subtract(even3, odd4);
        }
    }
}
|||
@ -0,0 +1,142 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Six Labors Split License.
|
|||
|
|||
using System.Runtime.Intrinsics; |
|||
using SixLabors.ImageSharp.Common.Helpers; |
|||
|
|||
namespace SixLabors.ImageSharp.Formats.Jpeg.Components; |
|||
|
|||
internal static partial class FloatingPointDCT
{
    /// <summary>
    /// Runs the floating point 8x8 FDCT over the block in place using <see cref="Vector256{T}"/> operations.
    /// </summary>
    /// <param name="block">Input block.</param>
    private static void FDCT8x8_Vector256(ref Block8x8F block)
    {
        DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");

        // Column pass.
        FDCT8x8_1D_Vector256(ref block);

        // Row pass: transpose the block, then reuse the same 1D kernel.
        block.TransposeInPlace();
        FDCT8x8_1D_Vector256(ref block);

        // 1D floating point FDCT: reads V256_0..V256_7 of the block and overwrites them with the result.
        static void FDCT8x8_1D_Vector256(ref Block8x8F block)
        {
            // Scaling constants, broadcast to every lane.
            Vector256<float> invSqrt2 = Vector256.Create(0.707106781f);
            Vector256<float> k0_3826 = Vector256.Create(0.382683433f);
            Vector256<float> k0_5411 = Vector256.Create(0.541196100f);
            Vector256<float> k1_3065 = Vector256.Create(1.306562965f);

            // All inputs are loaded before anything is written back.
            Vector256<float> row0 = block.V256_0;
            Vector256<float> row1 = block.V256_1;
            Vector256<float> row2 = block.V256_2;
            Vector256<float> row3 = block.V256_3;
            Vector256<float> row4 = block.V256_4;
            Vector256<float> row5 = block.V256_5;
            Vector256<float> row6 = block.V256_6;
            Vector256<float> row7 = block.V256_7;

            // Mirror-pair butterflies: row k with row (7 - k).
            Vector256<float> sum07 = row0 + row7;
            Vector256<float> sum16 = row1 + row6;
            Vector256<float> sum25 = row2 + row5;
            Vector256<float> sum34 = row3 + row4;
            Vector256<float> diff07 = row0 - row7;
            Vector256<float> diff16 = row1 - row6;
            Vector256<float> diff25 = row2 - row5;
            Vector256<float> diff34 = row3 - row4;

            // Even part: built from the pair sums.
            Vector256<float> outerSum = sum07 + sum34;
            Vector256<float> outerDiff = sum07 - sum34;
            Vector256<float> innerSum = sum16 + sum25;
            Vector256<float> innerDiff = sum16 - sum25;

            block.V256_0 = outerSum + innerSum;
            block.V256_4 = outerSum - innerSum;

            Vector256<float> evenRot = (innerDiff + outerDiff) * invSqrt2;
            block.V256_2 = outerDiff + evenRot;
            block.V256_6 = outerDiff - evenRot;

            // Odd part: built from the pair differences.
            Vector256<float> odd45 = diff34 + diff25;
            Vector256<float> odd56 = diff25 + diff16;
            Vector256<float> odd67 = diff16 + diff07;

            Vector256<float> shared = (odd45 - odd67) * k0_3826;

            // NOTE(review): MultiplyAdd(a, b, c) is assumed to yield (b * c) + a; confirm against the helper.
            Vector256<float> rotLow = Vector256_.MultiplyAdd(shared, k0_5411, odd45);
            Vector256<float> rotHigh = Vector256_.MultiplyAdd(shared, k1_3065, odd67);
            Vector256<float> mid = odd56 * invSqrt2;

            Vector256<float> plusMid = diff07 + mid;
            Vector256<float> minusMid = diff07 - mid;

            block.V256_1 = plusMid + rotHigh;
            block.V256_3 = minusMid - rotLow;
            block.V256_5 = minusMid + rotLow;
            block.V256_7 = plusMid - rotHigh;
        }
    }

    /// <summary>
    /// Runs the floating point 8x8 IDCT over the block in place using <see cref="Vector256{T}"/> operations.
    /// </summary>
    /// <param name="transposedBlock">Transposed input block.</param>
    private static void IDCT8x8_Vector256(ref Block8x8F transposedBlock)
    {
        DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");

        // Column pass.
        IDCT8x8_1D_Vector256(ref transposedBlock);

        // Row pass: transpose the block, then reuse the same 1D kernel.
        transposedBlock.TransposeInPlace();
        IDCT8x8_1D_Vector256(ref transposedBlock);

        // 1D floating point IDCT: reads V256_0..V256_7 of the block and overwrites them with the result.
        static void IDCT8x8_1D_Vector256(ref Block8x8F block)
        {
            // Scaling constants, broadcast to every lane.
            Vector256<float> sqrt2 = Vector256.Create(1.414213562f);
            Vector256<float> k1_8477 = Vector256.Create(1.847759065f);
            Vector256<float> kNeg1_0823 = Vector256.Create(-1.082392200f);
            Vector256<float> kNeg2_6131 = Vector256.Create(-2.613125930f);

            // All inputs are loaded before anything is written back.
            Vector256<float> in0 = block.V256_0;
            Vector256<float> in1 = block.V256_1;
            Vector256<float> in2 = block.V256_2;
            Vector256<float> in3 = block.V256_3;
            Vector256<float> in4 = block.V256_4;
            Vector256<float> in5 = block.V256_5;
            Vector256<float> in6 = block.V256_6;
            Vector256<float> in7 = block.V256_7;

            // Even part (inputs 0, 2, 4, 6).
            Vector256<float> evenSum04 = in0 + in4;
            Vector256<float> evenDiff04 = in0 - in4;
            Vector256<float> evenSum26 = in2 + in6;

            // NOTE(review): MultiplySubtract(a, b, c) is assumed to yield (b * c) - a; confirm against the helper.
            Vector256<float> evenRot26 = Vector256_.MultiplySubtract(evenSum26, in2 - in6, sqrt2);

            Vector256<float> even0 = evenSum04 + evenSum26;
            Vector256<float> even1 = evenDiff04 + evenRot26;
            Vector256<float> even2 = evenDiff04 - evenRot26;
            Vector256<float> even3 = evenSum04 - evenSum26;

            // Odd part (inputs 1, 3, 5, 7).
            Vector256<float> sum53 = in5 + in3;
            Vector256<float> diff53 = in5 - in3;
            Vector256<float> sum17 = in1 + in7;
            Vector256<float> diff17 = in1 - in7;

            Vector256<float> odd7 = sum17 + sum53;
            Vector256<float> oddMid = (sum17 - sum53) * sqrt2;
            Vector256<float> rot = (diff53 + diff17) * k1_8477;

            // NOTE(review): MultiplyAdd(a, b, c) is assumed to yield (b * c) + a; confirm against the helper.
            Vector256<float> termA = Vector256_.MultiplyAdd(rot, diff17, kNeg1_0823);
            Vector256<float> termB = Vector256_.MultiplyAdd(rot, diff53, kNeg2_6131);

            // Each odd term depends on the one computed just before it.
            Vector256<float> odd6 = termB - odd7;
            Vector256<float> odd5 = oddMid - odd6;
            Vector256<float> odd4 = termA - odd5;

            // Final butterflies: output k is even + odd, output (7 - k) is even - odd.
            block.V256_0 = even0 + odd7;
            block.V256_1 = even1 + odd6;
            block.V256_2 = even2 + odd5;
            block.V256_3 = even3 + odd4;
            block.V256_4 = even3 - odd4;
            block.V256_5 = even2 - odd5;
            block.V256_6 = even1 - odd6;
            block.V256_7 = even0 - odd7;
        }
    }
}
|||
Loading…
Reference in new issue