Browse Source

Merge pull request #1817 from SixLabors/bp/sse4X4

Add SSE2 version of Vp8Sse4X4
pull/1820/head
James Jackson-South 4 years ago
committed by GitHub
parent
commit
7d74c4c95a
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 66
      src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
  2. 16
      src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
  3. 3
      src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
  4. 1
      src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
  5. 17
      src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
  6. 38
      tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

66
src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

@ -19,17 +19,63 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
#endif
// Note: method name in libwebp reference implementation is called VP8SSE16x16.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 16);
// Note: method name in libwebp reference implementation is called VP8SSE16x8.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X8(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 8);
public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 8);
// Note: method name in libwebp reference implementation is called VP8SSE4x4.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse4X4(Span<byte> a, Span<byte> b) => GetSse(a, b, 4, 4);
// Computes the sum of squared differences between the 4x4 pixel blocks found at the start of
// a and b. Consecutive rows of a block are WebpConstants.Bps bytes apart in both spans.
// Uses an SSE2 implementation when available, otherwise defers to Vp8_SseNxN(a, b, 4, 4).
public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse2.IsSupported)
    {
        // Load values. Every load reads a full 16 bytes even though only the first 4 bytes of each row
        // are used below, so both spans must extend at least 16 bytes past the start of the fourth row.
        ref byte aRef = ref MemoryMarshal.GetReference(a);
        Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref aRef);
        Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps));
        Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2));
        Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3));
        ref byte bRef = ref MemoryMarshal.GetReference(b);
        Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref bRef);
        Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps));
        Vector128<byte> b2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2));
        Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3));

        // Combine pair of lines: the low 8 bytes now hold the first 4 pixels of two consecutive rows.
        Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
        Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
        Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
        Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());

        // Convert to 16b by interleaving with zero bytes, and treat the result as signed 16 bit lanes.
        Vector128<short> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero).AsInt16();

        // Subtract, square and accumulate.
        // The subtraction must run on signed 16 bit lanes: the unsigned byte overload of SubtractSaturate
        // clamps every negative difference to zero, which silently drops the error of all pixels where a < b.
        Vector128<short> d0 = Sse2.SubtractSaturate(a01s, b01s);
        Vector128<short> d1 = Sse2.SubtractSaturate(a23s, b23s);
        Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
        Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
        Vector128<int> sum = Sse2.Add(e0, e1);

        return Numerics.ReduceSum(sum);
    }
    else
#endif
    {
        return Vp8_SseNxN(a, b, 4, 4);
    }
}
[MethodImpl(InliningOptions.ShortMethod)]
public static int GetSse(Span<byte> a, Span<byte> b, int w, int h)
public static int Vp8_SseNxN(Span<byte> a, Span<byte> b, int w, int h)
{
int count = 0;
int aOffset = 0;
@ -88,7 +134,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse41.IsSupported)
{
int diffSum = TTransformSse41(a, b, w, scratch);
int diffSum = TTransformSse41(a, b, w);
return Math.Abs(diffSum) >> 5;
}
else
@ -615,11 +661,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
/// Returns the weighted sum of the absolute value of transformed coefficients.
/// w[] contains a row-major 4 by 4 symmetric matrix.
/// </summary>
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch)
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
{
Span<int> sum = scratch.Slice(0, 4);
sum.Clear();
// Load and combine inputs.
Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
@ -724,9 +767,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// difference of weighted sums.
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
ref int outputRef = ref MemoryMarshal.GetReference(sum);
Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
return sum[3] + sum[2] + sum[1] + sum[0];
return Numerics.ReduceSum(result);
}
#endif
@ -739,7 +780,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch)
{
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
int tmpOffset = 0;
for (int srcOffset = 0; srcOffset < 4; srcOffset++)
{

16
src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs

@ -66,7 +66,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
rdCur.Nz = (uint)ReconstructIntra16(it, dqm, rdCur, tmpDst, mode);
// Measure RD-score.
rdCur.D = LossyUtils.Vp8Sse16X16(src, tmpDst);
rdCur.D = LossyUtils.Vp8_Sse16X16(src, tmpDst);
rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0;
rdCur.H = WebpConstants.Vp8FixedCostsI16[mode];
rdCur.R = it.GetCostLuma16(rdCur, proba, res);
@ -160,7 +160,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode);
// Compute RD-score.
rdTmp.D = LossyUtils.Vp8Sse4X4(src, tmpDst);
rdTmp.D = LossyUtils.Vp8_Sse4X4(src, tmpDst);
rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0;
rdTmp.H = modeCosts[mode];
@ -251,7 +251,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode);
// Compute RD-score
rdUv.D = LossyUtils.Vp8Sse16X8(src, tmpDst);
rdUv.D = LossyUtils.Vp8_Sse16X8(src, tmpDst);
rdUv.SD = 0; // not calling TDisto here: it tends to flatten areas.
rdUv.H = WebpConstants.Vp8FixedCostsUv[mode];
rdUv.R = it.GetCostUv(rdUv, proba, res);
@ -340,8 +340,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
Span<short> tmp = it.Scratch2.AsSpan(0, 16);
Span<int> scratch = it.Scratch3.AsSpan(0, 16);
tmp.Clear();
scratch.Clear();
Vp8Encoding.FTransform(src, reference, tmp, scratch);
int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
@ -357,8 +355,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
int n;
Span<short> tmp = it.Scratch2.AsSpan(0, 8 * 16);
Span<int> scratch = it.Scratch3.AsSpan(0, 16);
tmp.Clear();
scratch.Clear();
for (n = 0; n < 8; n += 2)
{
@ -411,7 +407,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
{
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
long score = (LossyUtils.Vp8_Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
if (mode > 0 && WebpConstants.Vp8FixedCostsI16[mode] > bitLimit)
{
@ -458,7 +454,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
for (mode = 0; mode < WebpConstants.NumBModes; ++mode)
{
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
long score = (LossyUtils.Vp8_Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
if (score < bestI4Score)
{
bestI4Mode = mode;
@ -507,7 +503,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
{
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8UvModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
long score = (LossyUtils.Vp8_Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
if (score < bestUvScore)
{
bestMode = mode;

3
src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs

@ -81,7 +81,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
int i;
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
for (i = 0; i < 4; i++)
{
// vertical pass.
@ -124,7 +123,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
int i;
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
int srcIdx = 0;
int refIdx = 0;
@ -163,7 +161,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public static void FTransformWht(Span<short> input, Span<short> output, Span<int> scratch)
{
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
int i;
int inputIdx = 0;

1
src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs

@ -49,7 +49,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
this.distribution.AsSpan().Clear();
for (j = startBlock; j < endBlock; j++)
{
this.output.AsSpan().Clear();
this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output);
// Convert coefficients to bin.

17
src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs

@ -97,18 +97,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public void Clear()
{
this.YDcLevels.AsSpan().Clear();
this.YAcLevels.AsSpan().Clear();
this.UvLevels.AsSpan().Clear();
this.ModesI4.AsSpan().Clear();
for (int i = 0; i < 2; i++)
{
for (int j = 0; j < 3; j++)
{
this.Derr[i, j] = 0;
}
}
Array.Clear(this.YDcLevels, 0, this.YDcLevels.Length);
Array.Clear(this.YAcLevels, 0, this.YAcLevels.Length);
Array.Clear(this.UvLevels, 0, this.UvLevels.Length);
Array.Clear(this.ModesI4, 0, this.ModesI4.Length);
Array.Clear(this.Derr, 0, this.Derr.Length);
}
public void InitScore()

38
tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

@ -11,6 +11,35 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
[Trait("Format", "Webp")]
public class LossyUtilsTests
{
// Calls LossyUtils.Vp8_Sse4X4 with two fixed byte buffers and checks that it returns 27.
// Shared by the plain [Fact] and by the runs with hardware intrinsics enabled and disabled.
private static void RunVp8Sse4X4Test()
{
// arrange
byte[] a =
{
27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129,
129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28,
28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26,
26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128,
128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27,
129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128
};
byte[] b =
{
26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26,
26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204
};
// NOTE(review): assuming the row stride WebpConstants.Bps is 32 (TODO confirm), the four sampled rows
// start at indices 0, 32, 64 and 96 and the squared differences add up to 15 + 4 + 2 + 6 = 27.
// In that region every value of a is >= the matching value of b, so this fixture never produces a
// negative difference; consider adding a case where a < b to cover that path as well.
int expected = 27;
// act
int actual = LossyUtils.Vp8_Sse4X4(a, b);
// assert
Assert.Equal(expected, actual);
}
private static void RunMean16x4Test()
{
// arrange
@ -61,6 +90,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
Assert.Equal(expected, actual);
}
[Fact]
public void Vp8Sse4X4_Works() => RunVp8Sse4X4Test();
[Fact]
public void Mean16x4_Works() => RunMean16x4Test();
@ -68,6 +100,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
public void HadamardTransform_Works() => RunHadamardTransformTest();
#if SUPPORTS_RUNTIME_INTRINSICS
[Fact]
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
[Fact]
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);
[Fact]
public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);

Loading…
Cancel
Save