Browse Source

Merge pull request #1817 from SixLabors/bp/sse4X4

Add SSE2 version of Vp8Sse4X4
pull/1820/head
James Jackson-South 4 years ago
committed by GitHub
parent
commit
7d74c4c95a
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 66
      src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
  2. 16
      src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
  3. 3
      src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
  4. 1
      src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
  5. 17
      src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
  6. 38
      tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

66
src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

@ -19,17 +19,63 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
#endif #endif
// Note: method name in libwebp reference implementation is called VP8SSE16x16.
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16); public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 16);
// Note: method name in libwebp reference implementation is called VP8SSE16x8.
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X8(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 8); public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 8);
// Note: method name in libwebp reference implementation is called VP8SSE4x4.
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse4X4(Span<byte> a, Span<byte> b) => GetSse(a, b, 4, 4); public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
{
    // Returns the sum of squared differences between the 4x4 pixel blocks that start at
    // the beginning of a and b. Consecutive rows are WebpConstants.Bps bytes apart.
    // The SSE2 path is a faster route to the value the scalar fallback
    // Vp8_SseNxN(a, b, 4, 4) is expected to produce.
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse2.IsSupported)
    {
        // Load values. Every load reads 16 bytes although only the first 4 of each row are used.
        // NOTE(review): this assumes both spans hold at least (3 * Bps) + 16 bytes - confirm that callers guarantee it.
        ref byte aRef = ref MemoryMarshal.GetReference(a);
        Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref aRef);
        Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps));
        Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2));
        Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3));
        ref byte bRef = ref MemoryMarshal.GetReference(b);
        Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref bRef);
        Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps));
        Vector128<byte> b2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2));
        Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3));

        // Combine pair of lines: the low 8 bytes of each result hold the 4 pixels of two consecutive rows.
        Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
        Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
        Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
        Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());

        // Convert to 16b: interleaving with zero bytes zero-extends every pixel into its own 16-bit lane.
        Vector128<short> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero).AsInt16();

        // Subtract, square and accumulate.
        // The subtraction must run on signed 16-bit lanes: the unsigned byte overload of
        // SubtractSaturate clamps every negative difference to zero, which would drop the
        // contribution of all pixels where a < b.
        Vector128<short> d0 = Sse2.SubtractSaturate(a01s, b01s);
        Vector128<short> d1 = Sse2.SubtractSaturate(a23s, b23s);
        Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
        Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
        Vector128<int> sum = Sse2.Add(e0, e1);

        // Horizontal add of the four 32-bit partial sums.
        return Numerics.ReduceSum(sum);
    }
    else
#endif
    {
        return Vp8_SseNxN(a, b, 4, 4);
    }
}
[MethodImpl(InliningOptions.ShortMethod)] [MethodImpl(InliningOptions.ShortMethod)]
public static int GetSse(Span<byte> a, Span<byte> b, int w, int h) public static int Vp8_SseNxN(Span<byte> a, Span<byte> b, int w, int h)
{ {
int count = 0; int count = 0;
int aOffset = 0; int aOffset = 0;
@ -88,7 +134,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
#if SUPPORTS_RUNTIME_INTRINSICS #if SUPPORTS_RUNTIME_INTRINSICS
if (Sse41.IsSupported) if (Sse41.IsSupported)
{ {
int diffSum = TTransformSse41(a, b, w, scratch); int diffSum = TTransformSse41(a, b, w);
return Math.Abs(diffSum) >> 5; return Math.Abs(diffSum) >> 5;
} }
else else
@ -615,11 +661,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
/// Returns the weighted sum of the absolute value of transformed coefficients. /// Returns the weighted sum of the absolute value of transformed coefficients.
/// w[] contains a row-major 4 by 4 symmetric matrix. /// w[] contains a row-major 4 by 4 symmetric matrix.
/// </summary> /// </summary>
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch) public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
{ {
Span<int> sum = scratch.Slice(0, 4);
sum.Clear();
// Load and combine inputs. // Load and combine inputs.
Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA)); Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16))); Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
@ -724,9 +767,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// difference of weighted sums. // difference of weighted sums.
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32()); Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
ref int outputRef = ref MemoryMarshal.GetReference(sum); return Numerics.ReduceSum(result);
Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
return sum[3] + sum[2] + sum[1] + sum[0];
} }
#endif #endif
@ -739,7 +780,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch) public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch)
{ {
Span<int> tmp = scratch.Slice(0, 16); Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
int tmpOffset = 0; int tmpOffset = 0;
for (int srcOffset = 0; srcOffset < 4; srcOffset++) for (int srcOffset = 0; srcOffset < 4; srcOffset++)
{ {

16
src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs

@ -66,7 +66,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
rdCur.Nz = (uint)ReconstructIntra16(it, dqm, rdCur, tmpDst, mode); rdCur.Nz = (uint)ReconstructIntra16(it, dqm, rdCur, tmpDst, mode);
// Measure RD-score. // Measure RD-score.
rdCur.D = LossyUtils.Vp8Sse16X16(src, tmpDst); rdCur.D = LossyUtils.Vp8_Sse16X16(src, tmpDst);
rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0; rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0;
rdCur.H = WebpConstants.Vp8FixedCostsI16[mode]; rdCur.H = WebpConstants.Vp8FixedCostsI16[mode];
rdCur.R = it.GetCostLuma16(rdCur, proba, res); rdCur.R = it.GetCostLuma16(rdCur, proba, res);
@ -160,7 +160,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode); rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode);
// Compute RD-score. // Compute RD-score.
rdTmp.D = LossyUtils.Vp8Sse4X4(src, tmpDst); rdTmp.D = LossyUtils.Vp8_Sse4X4(src, tmpDst);
rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0; rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0;
rdTmp.H = modeCosts[mode]; rdTmp.H = modeCosts[mode];
@ -251,7 +251,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode); rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode);
// Compute RD-score // Compute RD-score
rdUv.D = LossyUtils.Vp8Sse16X8(src, tmpDst); rdUv.D = LossyUtils.Vp8_Sse16X8(src, tmpDst);
rdUv.SD = 0; // not calling TDisto here: it tends to flatten areas. rdUv.SD = 0; // not calling TDisto here: it tends to flatten areas.
rdUv.H = WebpConstants.Vp8FixedCostsUv[mode]; rdUv.H = WebpConstants.Vp8FixedCostsUv[mode];
rdUv.R = it.GetCostUv(rdUv, proba, res); rdUv.R = it.GetCostUv(rdUv, proba, res);
@ -340,8 +340,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]); Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
Span<short> tmp = it.Scratch2.AsSpan(0, 16); Span<short> tmp = it.Scratch2.AsSpan(0, 16);
Span<int> scratch = it.Scratch3.AsSpan(0, 16); Span<int> scratch = it.Scratch3.AsSpan(0, 16);
tmp.Clear();
scratch.Clear();
Vp8Encoding.FTransform(src, reference, tmp, scratch); Vp8Encoding.FTransform(src, reference, tmp, scratch);
int nz = QuantizeBlock(tmp, levels, ref dqm.Y1); int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch); Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
@ -357,8 +355,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
int n; int n;
Span<short> tmp = it.Scratch2.AsSpan(0, 8 * 16); Span<short> tmp = it.Scratch2.AsSpan(0, 8 * 16);
Span<int> scratch = it.Scratch3.AsSpan(0, 16); Span<int> scratch = it.Scratch3.AsSpan(0, 16);
tmp.Clear();
scratch.Clear();
for (n = 0; n < 8; n += 2) for (n = 0; n < 8; n += 2)
{ {
@ -411,7 +407,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
for (mode = 0; mode < WebpConstants.NumPredModes; ++mode) for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
{ {
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]); Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16); long score = (LossyUtils.Vp8_Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
if (mode > 0 && WebpConstants.Vp8FixedCostsI16[mode] > bitLimit) if (mode > 0 && WebpConstants.Vp8FixedCostsI16[mode] > bitLimit)
{ {
@ -458,7 +454,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
for (mode = 0; mode < WebpConstants.NumBModes; ++mode) for (mode = 0; mode < WebpConstants.NumBModes; ++mode)
{ {
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]); Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4); long score = (LossyUtils.Vp8_Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
if (score < bestI4Score) if (score < bestI4Score)
{ {
bestI4Mode = mode; bestI4Mode = mode;
@ -507,7 +503,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
for (mode = 0; mode < WebpConstants.NumPredModes; ++mode) for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
{ {
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8UvModeOffsets[mode]); Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8UvModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv); long score = (LossyUtils.Vp8_Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
if (score < bestUvScore) if (score < bestUvScore)
{ {
bestMode = mode; bestMode = mode;

3
src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs

@ -81,7 +81,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{ {
int i; int i;
Span<int> tmp = scratch.Slice(0, 16); Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
for (i = 0; i < 4; i++) for (i = 0; i < 4; i++)
{ {
// vertical pass. // vertical pass.
@ -124,7 +123,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{ {
int i; int i;
Span<int> tmp = scratch.Slice(0, 16); Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
int srcIdx = 0; int srcIdx = 0;
int refIdx = 0; int refIdx = 0;
@ -163,7 +161,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public static void FTransformWht(Span<short> input, Span<short> output, Span<int> scratch) public static void FTransformWht(Span<short> input, Span<short> output, Span<int> scratch)
{ {
Span<int> tmp = scratch.Slice(0, 16); Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
int i; int i;
int inputIdx = 0; int inputIdx = 0;

1
src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs

@ -49,7 +49,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
this.distribution.AsSpan().Clear(); this.distribution.AsSpan().Clear();
for (j = startBlock; j < endBlock; j++) for (j = startBlock; j < endBlock; j++)
{ {
this.output.AsSpan().Clear();
this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output); this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output);
// Convert coefficients to bin. // Convert coefficients to bin.

17
src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs

@ -97,18 +97,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public void Clear() public void Clear()
{ {
this.YDcLevels.AsSpan().Clear(); Array.Clear(this.YDcLevels, 0, this.YDcLevels.Length);
this.YAcLevels.AsSpan().Clear(); Array.Clear(this.YAcLevels, 0, this.YAcLevels.Length);
this.UvLevels.AsSpan().Clear(); Array.Clear(this.UvLevels, 0, this.UvLevels.Length);
this.ModesI4.AsSpan().Clear(); Array.Clear(this.ModesI4, 0, this.ModesI4.Length);
Array.Clear(this.Derr, 0, this.Derr.Length);
for (int i = 0; i < 2; i++)
{
for (int j = 0; j < 3; j++)
{
this.Derr[i, j] = 0;
}
}
} }
public void InitScore() public void InitScore()

38
tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

@ -11,6 +11,35 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
[Trait("Format", "Webp")] [Trait("Format", "Webp")]
public class LossyUtilsTests public class LossyUtilsTests
{ {
private static void RunVp8Sse4X4Test()
{
    // arrange
    // Two 128-byte buffers, written out 16 values per line.
    // NOTE(review): the data looks like four rows of 32 bytes each, presumably matching WebpConstants.Bps - confirm.
    byte[] first =
    {
        27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27,
        129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128,
        27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27,
        129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128,
        27, 27, 26, 26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27,
        129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128,
        28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27,
        129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128
    };
    byte[] second =
    {
        26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204
    };
    const int expected = 27;

    // act
    int actual = LossyUtils.Vp8_Sse4X4(first, second);

    // assert
    Assert.Equal(expected, actual);
}
private static void RunMean16x4Test() private static void RunMean16x4Test()
{ {
// arrange // arrange
@ -61,6 +90,9 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
Assert.Equal(expected, actual); Assert.Equal(expected, actual);
} }
[Fact]
public void Vp8Sse4X4_Works()
{
    // Runs the shared Vp8Sse4X4 check directly, without going through FeatureTestRunner.
    RunVp8Sse4X4Test();
}
[Fact] [Fact]
public void Mean16x4_Works() => RunMean16x4Test(); public void Mean16x4_Works() => RunMean16x4Test();
@ -68,6 +100,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
public void HadamardTransform_Works() => RunHadamardTransformTest(); public void HadamardTransform_Works() => RunHadamardTransformTest();
#if SUPPORTS_RUNTIME_INTRINSICS #if SUPPORTS_RUNTIME_INTRINSICS
[Fact]
public void Vp8Sse4X4_WithHardwareIntrinsics_Works()
{
    // Runs the shared Vp8Sse4X4 check through the feature runner, passing HwIntrinsics.AllowAll.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
}
[Fact]
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works()
{
    // Runs the shared Vp8Sse4X4 check through the feature runner, passing HwIntrinsics.DisableHWIntrinsic.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);
}
[Fact] [Fact]
public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll); public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);

Loading…
Cancel
Save