diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index ba5c588ca5..fa0af823d5 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -820,6 +820,26 @@ namespace SixLabors.ImageSharp
}
}
+ /// <summary>
+ /// Reduces elements of the vector into one sum.
+ /// </summary>
+ /// <param name="accumulator">The accumulator to reduce.</param>
+ /// <returns>The sum of all elements.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static int ReduceSum(Vector256 accumulator)
+ {
+ // Add the upper 128-bit half onto the lower 128-bit half.
+ Vector128 vsum = Sse2.Add(accumulator.GetLower(), accumulator.GetUpper());
+
+ // Add odd-indexed elements onto even-indexed ones (element 0 += element 1, element 2 += element 3).
+ vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_11_01_01));
+
+ // Add the high pair onto the low pair; element 0 now holds the total, which is what gets returned.
+ vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10));
+
+ return Sse2.ConvertToInt32(vsum);
+ }
+
///
/// Reduces even elements of the vector into one sum.
///
diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
index 471c083cda..ebb198a2d8 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -2,6 +2,7 @@
// Licensed under the Apache License, Version 2.0.
using System;
+using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Memory;
@@ -80,8 +81,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
public static int VectorMismatch(ReadOnlySpan array1, ReadOnlySpan array2, int length)
{
int matchLen = 0;
+ ref uint array1Ref = ref MemoryMarshal.GetReference(array1);
+ ref uint array2Ref = ref MemoryMarshal.GetReference(array2);
- while (matchLen < length && array1[matchLen] == array2[matchLen])
+ while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen))
{
matchLen++;
}
@@ -759,28 +762,184 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
/// Shanon entropy.
public static float CombinedShannonEntropy(Span x, Span y)
{
- double retVal = 0.0d;
- uint sumX = 0, sumXY = 0;
- for (int i = 0; i < 256; i++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx2.IsSupported)
{
- uint xi = (uint)x[i];
- if (xi != 0)
+ double retVal = 0.0d;
+ Vector256 tmp = Vector256.Zero; // has the size of the scratch space of sizeof(int) * 8
+ ref int xRef = ref MemoryMarshal.GetReference(x);
+ ref int yRef = ref MemoryMarshal.GetReference(y);
+ Vector256 sumXY256 = Vector256.Zero;
+ Vector256 sumX256 = Vector256.Zero;
+ ref int tmpRef = ref Unsafe.As, int>(ref tmp);
+ for (nint i = 0; i < 256; i += 8)
{
- uint xy = xi + (uint)y[i];
- sumX += xi;
- retVal -= FastSLog2(xi);
- sumXY += xy;
- retVal -= FastSLog2(xy);
+ Vector256 xVec = Unsafe.As>(ref Unsafe.Add(ref xRef, i));
+ Vector256 yVec = Unsafe.As>(ref Unsafe.Add(ref yRef, i));
+
+ // Check if any X is non-zero: this actually provides a speedup as X is usually sparse.
+ int mask = Avx2.MoveMask(Avx2.CompareEqual(xVec, Vector256.Zero).AsByte());
+ if (mask != -1)
+ {
+ Vector256 xy256 = Avx2.Add(xVec, yVec);
+ sumXY256 = Avx2.Add(sumXY256, xy256);
+ sumX256 = Avx2.Add(sumX256, xVec);
+
+ // Analyze the different X + Y.
+ Unsafe.As>(ref tmpRef) = xy256;
+ if (tmpRef != 0)
+ {
+ retVal -= FastSLog2((uint)tmpRef);
+ if (Unsafe.Add(ref xRef, i) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i));
+ }
+ }
+
+ if (Unsafe.Add(ref tmpRef, 1) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 1));
+ if (Unsafe.Add(ref xRef, i + 1) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 1));
+ }
+ }
+
+ if (Unsafe.Add(ref tmpRef, 2) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 2));
+ if (Unsafe.Add(ref xRef, i + 2) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 2));
+ }
+ }
+
+ if (Unsafe.Add(ref tmpRef, 3) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 3));
+ if (Unsafe.Add(ref xRef, i + 3) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 3));
+ }
+ }
+
+ if (Unsafe.Add(ref tmpRef, 4) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 4));
+ if (Unsafe.Add(ref xRef, i + 4) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 4));
+ }
+ }
+
+ if (Unsafe.Add(ref tmpRef, 5) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 5));
+ if (Unsafe.Add(ref xRef, i + 5) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 5));
+ }
+ }
+
+ if (Unsafe.Add(ref tmpRef, 6) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 6));
+ if (Unsafe.Add(ref xRef, i + 6) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 6));
+ }
+ }
+
+ if (Unsafe.Add(ref tmpRef, 7) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref tmpRef, 7));
+ if (Unsafe.Add(ref xRef, i + 7) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref xRef, i + 7));
+ }
+ }
+ }
+ else
+ {
+ // X is fully 0, so only deal with Y.
+ sumXY256 = Avx2.Add(sumXY256, yVec);
+
+ if (Unsafe.Add(ref yRef, i) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i));
+ }
+
+ if (Unsafe.Add(ref yRef, i + 1) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 1));
+ }
+
+ if (Unsafe.Add(ref yRef, i + 2) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 2));
+ }
+
+ if (Unsafe.Add(ref yRef, i + 3) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 3));
+ }
+
+ if (Unsafe.Add(ref yRef, i + 4) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 4));
+ }
+
+ if (Unsafe.Add(ref yRef, i + 5) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 5));
+ }
+
+ if (Unsafe.Add(ref yRef, i + 6) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 6));
+ }
+
+ if (Unsafe.Add(ref yRef, i + 7) != 0)
+ {
+ retVal -= FastSLog2((uint)Unsafe.Add(ref yRef, i + 7));
+ }
+ }
}
- else if (y[i] != 0)
+
+ // Sum up sumX256 to get sumX and sum up sumXY256 to get sumXY.
+ int sumX = Numerics.ReduceSum(sumX256);
+ int sumXY = Numerics.ReduceSum(sumXY256);
+
+ retVal += FastSLog2((uint)sumX) + FastSLog2((uint)sumXY);
+
+ return (float)retVal;
+ }
+ else
+#endif
+ {
+ double retVal = 0.0d;
+ uint sumX = 0, sumXY = 0;
+ for (int i = 0; i < 256; i++)
{
- sumXY += (uint)y[i];
- retVal -= FastSLog2((uint)y[i]);
+ uint xi = (uint)x[i];
+ if (xi != 0)
+ {
+ uint xy = xi + (uint)y[i];
+ sumX += xi;
+ retVal -= FastSLog2(xi);
+ sumXY += xy;
+ retVal -= FastSLog2(xy);
+ }
+ else if (y[i] != 0)
+ {
+ sumXY += (uint)y[i];
+ retVal -= FastSLog2((uint)y[i]);
+ }
}
- }
- retVal += FastSLog2(sumX) + FastSLog2(sumXY);
- return (float)retVal;
+ retVal += FastSLog2(sumX) + FastSLog2(sumXY);
+ return (float)retVal;
+ }
}
[MethodImpl(InliningOptions.ShortMethod)]
@@ -836,6 +995,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
private static float FastSLog2Slow(uint v)
{
DebugGuard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v));
+
if (v < ApproxLogWithCorrectionMax)
{
int logCnt = 0;
@@ -865,7 +1025,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
private static float FastLog2Slow(uint v)
{
- Guard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v));
+ DebugGuard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v));
if (v < ApproxLogWithCorrectionMax)
{
diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
index bdb53f5c6a..bfb8f40d4a 100644
--- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs
@@ -3,10 +3,16 @@
using System;
using System.Collections.Generic;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
namespace SixLabors.ImageSharp.Formats.Webp.Lossless
{
- internal class Vp8LHistogram : IDeepCloneable
+ internal sealed class Vp8LHistogram : IDeepCloneable
{
private const uint NonTrivialSym = 0xffffffff;
@@ -505,11 +511,52 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossless
return cost;
}
- private static void AddVector(uint[] a, uint[] b, uint[] output, int size)
+ private static void AddVector(Span a, Span b, Span output, int count)
{
- for (int i = 0; i < size; i++)
+ DebugGuard.MustBeGreaterThanOrEqualTo(a.Length, count, nameof(a.Length));
+ DebugGuard.MustBeGreaterThanOrEqualTo(b.Length, count, nameof(b.Length));
+ DebugGuard.MustBeGreaterThanOrEqualTo(output.Length, count, nameof(output.Length));
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx2.IsSupported)
+ {
+ ref uint aRef = ref MemoryMarshal.GetReference(a);
+ ref uint bRef = ref MemoryMarshal.GetReference(b);
+ ref uint outputRef = ref MemoryMarshal.GetReference(output);
+ int i;
+
+ for (i = 0; i + 32 <= count; i += 32)
+ {
+ // Load values.
+ Vector256 a0 = Unsafe.As>(ref Unsafe.Add(ref aRef, i));
+ Vector256 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, i + 8));
+ Vector256 a2 = Unsafe.As>(ref Unsafe.Add(ref aRef, i + 16));
+ Vector256 a3 = Unsafe.As>(ref Unsafe.Add(ref aRef, i + 24));
+ Vector256 b0 = Unsafe.As>(ref Unsafe.Add(ref bRef, i));
+ Vector256 b1 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 8));
+ Vector256 b2 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 16));
+ Vector256 b3 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 24));
+
+ // Note we are adding uint32_t's as *signed* int32's (using _mm256_add_epi32). But
+ // that's ok since the histogram values are less than 1<<28 (max picture count).
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, i)) = Avx2.Add(a0, b0);
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 8)) = Avx2.Add(a1, b1);
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 16)) = Avx2.Add(a2, b2);
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 24)) = Avx2.Add(a3, b3);
+ }
+
+ for (; i < count; i++)
+ {
+ output[i] = a[i] + b[i];
+ }
+ }
+ else
+#endif
{
- output[i] = a[i] + b[i];
+ for (int i = 0; i < count; i++)
+ {
+ output[i] = a[i] + b[i];
+ }
}
}
}
diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
index 2fcea8ceea..de6f807da2 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -726,7 +726,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
uint v = src[0] * 0x01010101u;
Span vSpan = BitConverter.GetBytes(v).AsSpan();
- for (int i = 0; i < 16; i++)
+ for (nint i = 0; i < 16; i++)
{
if (!src.Slice(0, 4).SequenceEqual(vSpan) || !src.Slice(4, 4).SequenceEqual(vSpan) ||
!src.Slice(8, 4).SequenceEqual(vSpan) || !src.Slice(12, 4).SequenceEqual(vSpan))
@@ -744,19 +744,21 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
private static bool IsFlat(Span levels, int numBlocks, int thresh)
{
int score = 0;
+ ref short levelsRef = ref MemoryMarshal.GetReference(levels);
+ int offset = 0; // Index of the first coefficient (DC) of the current 16-coefficient block.
while (numBlocks-- > 0)
{
- for (int i = 1; i < 16; i++)
+ for (nint i = 1; i < 16; i++) // Start at 1 so the DC coefficient at index 0 of each block is skipped.
{
// omit DC, we're only interested in AC
- score += levels[i] != 0 ? 1 : 0;
+ score += Unsafe.Add(ref levelsRef, offset + i) != 0 ? 1 : 0; // Must index offset + i; offset alone re-reads the DC coefficient.
if (score > thresh)
{
return false;
}
}
- levels = levels.Slice(16);
+ offset += 16;
}
return true;
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs
index d62d23e172..14bc19e8a2 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Decoder.cs
@@ -76,10 +76,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
this.TmpVBuffer = memoryAllocator.Allocate((int)width);
this.Pixels = memoryAllocator.Allocate((int)(width * height * 4));
+#if DEBUG
+ // Filling these buffers with 205 is only useful for debugging:
+ // it makes the default values match the reference libwebp implementation.
this.YuvBuffer.Memory.Span.Fill(205);
this.CacheY.Memory.Span.Fill(205);
this.CacheU.Memory.Span.Fill(205);
this.CacheV.Memory.Span.Fill(205);
+#endif
this.Vp8BitReaders = new Vp8BitReader[WebpConstants.MaxNumPartitions];
}
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index aa4ab5767b..f12a1a7855 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
///
/// Methods for encoding a VP8 frame.
///
- internal static class Vp8Encoding
+ internal static unsafe class Vp8Encoding
{
private const int KC1 = 20091 + (1 << 16);
@@ -66,11 +66,39 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 };
#if SUPPORTS_RUNTIME_INTRINSICS
- public static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16();
+#pragma warning disable SA1310 // Field names should not contain underscore
+ private static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16();
- public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16();
+ private static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16();
- public static readonly Vector128 Four = Vector128.Create((short)4);
+ private static readonly Vector128 Four = Vector128.Create((short)4);
+
+ private static readonly Vector128 Seven = Vector128.Create((short)7);
+
+ private static readonly Vector128 K88p = Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16();
+
+ private static readonly Vector128 K88m = Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16();
+
+ private static readonly Vector128 K5352_2217p = Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16();
+
+ private static readonly Vector128 K5352_2217m = Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16();
+
+ private static readonly Vector128 K937 = Vector128.Create(937);
+
+ private static readonly Vector128 K1812 = Vector128.Create(1812);
+
+ private static readonly Vector128 K5352_2217 = Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16();
+
+ private static readonly Vector128 K2217_5352 = Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16();
+
+ private static readonly Vector128 K12000PlusOne = Vector128.Create(12000 + (1 << 16));
+
+ private static readonly Vector128 K51000 = Vector128.Create(51000);
+
+ private static readonly byte MmShuffle2301 = SimdUtils.Shuffle.MmShuffle(2, 3, 0, 1);
+
+ private static readonly byte MmShuffle1032 = SimdUtils.Shuffle.MmShuffle(1, 0, 3, 2);
+#pragma warning restore SA1310 // Field names should not contain underscore
#endif
static Vp8Encoding()
@@ -376,49 +404,246 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
public static void FTransform2(Span src, Span reference, Span output, Span output2, Span scratch)
{
- FTransform(src, reference, output, scratch);
- FTransform(src.Slice(4), reference.Slice(4), output2, scratch);
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Sse2.IsSupported)
+ {
+ ref byte srcRef = ref MemoryMarshal.GetReference(src);
+ ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
+
+ // Load src.
+ var src0 = Vector128.Create(Unsafe.As(ref srcRef), 0);
+ var src1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps)), 0);
+ var src2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 2)), 0);
+ var src3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 3)), 0);
+
+ // Load ref.
+ var ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0);
+ var ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0);
+ var ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0);
+ var ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0);
+
+ // Convert both to 16 bit.
+ Vector128 srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128.Zero);
+ Vector128 srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128.Zero);
+ Vector128 srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128.Zero);
+ Vector128 srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128.Zero);
+ Vector128 refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128.Zero);
+ Vector128 refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128.Zero);
+ Vector128 refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128.Zero);
+ Vector128 refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128.Zero);
+
+ // Compute difference. -> 00 01 02 03 00' 01' 02' 03'
+ Vector128 diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16());
+ Vector128 diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16());
+ Vector128 diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16());
+ Vector128 diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16());
+
+ // Unpack and shuffle.
+ // 00 01 02 03 0 0 0 0
+ // 10 11 12 13 0 0 0 0
+ // 20 21 22 23 0 0 0 0
+ // 30 31 32 33 0 0 0 0
+ Vector128 shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
+ Vector128 shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
+ Vector128 shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
+ Vector128 shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());
+
+ // First pass.
+ FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128 v01l, out Vector128 v32l);
+ FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128 v01h, out Vector128 v32h);
+
+ // Second pass.
+ FTransformPass2SSE2(v01l, v32l, output);
+ FTransformPass2SSE2(v01h, v32h, output2);
+ }
+ else
+#endif
+ {
+ FTransform(src, reference, output, scratch);
+ FTransform(src.Slice(4), reference.Slice(4), output2, scratch);
+ }
}
public static void FTransform(Span src, Span reference, Span output, Span scratch)
{
- int i;
- Span tmp = scratch.Slice(0, 16);
-
- int srcIdx = 0;
- int refIdx = 0;
- for (i = 0; i < 4; i++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Sse2.IsSupported)
{
- int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255])
- int d1 = src[srcIdx + 1] - reference[refIdx + 1];
- int d2 = src[srcIdx + 2] - reference[refIdx + 2];
- int d3 = src[srcIdx + 3] - reference[refIdx + 3];
- int a0 = d0 + d3; // 10b [-510,510]
- int a1 = d1 + d2;
- int a2 = d1 - d2;
- int a3 = d0 - d3;
- tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160]
- tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542]
- tmp[2 + (i * 4)] = (a0 - a1) * 8;
- tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9;
-
- srcIdx += WebpConstants.Bps;
- refIdx += WebpConstants.Bps;
- }
+ ref byte srcRef = ref MemoryMarshal.GetReference(src);
+ ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
- for (i = 0; i < 4; i++)
+ // Load src.
+ var src0 = Vector128.Create(Unsafe.As(ref srcRef), 0);
+ var src1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps)), 0);
+ var src2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 2)), 0);
+ var src3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref srcRef, WebpConstants.Bps * 3)), 0);
+
+ // Load ref.
+ var ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0);
+ var ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0);
+ var ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0);
+ var ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0);
+
+ // 00 01 02 03 *
+ // 10 11 12 13 *
+ // 20 21 22 23 *
+ // 30 31 32 33 *
+ // Shuffle.
+ Vector128 srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16());
+ Vector128 srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16());
+ Vector128 refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
+ Vector128 refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16());
+
+ // 00 01 10 11 02 03 12 13 * * ...
+ // 20 21 30 31 22 23 32 33 * * ...
+
+ // Convert both to 16 bit.
+ Vector128 src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128.Zero);
+ Vector128 src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128.Zero);
+ Vector128 ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128.Zero);
+ Vector128 ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128.Zero);
+
+ // Compute the difference.
+ Vector128 row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16());
+ Vector128 row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16());
+
+ // First pass.
+ FTransformPass1SSE2(row01, row23, out Vector128 v01, out Vector128 v32);
+
+ // Second pass.
+ FTransformPass2SSE2(v01, v32, output);
+ }
+ else
+#endif
{
- int a0 = tmp[0 + i] + tmp[12 + i]; // 15b
- int a1 = tmp[4 + i] + tmp[8 + i];
- int a2 = tmp[4 + i] - tmp[8 + i];
- int a3 = tmp[0 + i] - tmp[12 + i];
- output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b
- output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0));
- output[8 + i] = (short)((a0 - a1 + 7) >> 4);
- output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16);
+ int i;
+ Span tmp = scratch.Slice(0, 16);
+
+ int srcIdx = 0;
+ int refIdx = 0;
+ for (i = 0; i < 4; i++)
+ {
+ int d3 = src[srcIdx + 3] - reference[refIdx + 3];
+ int d2 = src[srcIdx + 2] - reference[refIdx + 2];
+ int d1 = src[srcIdx + 1] - reference[refIdx + 1];
+ int d0 = src[srcIdx] - reference[refIdx]; // 9bit dynamic range ([-255,255])
+ int a0 = d0 + d3; // 10b [-510,510]
+ int a1 = d1 + d2;
+ int a2 = d1 - d2;
+ int a3 = d0 - d3;
+ tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9;
+ tmp[2 + (i * 4)] = (a0 - a1) * 8;
+ tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542]
+ tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160]
+
+ srcIdx += WebpConstants.Bps;
+ refIdx += WebpConstants.Bps;
+ }
+
+ for (i = 0; i < 4; i++)
+ {
+ int t12 = tmp[12 + i]; // 15b
+ int t8 = tmp[8 + i];
+
+ int a1 = tmp[4 + i] + t8;
+ int a2 = tmp[4 + i] - t8;
+ int a0 = tmp[0 + i] + t12; // 15b
+ int a3 = tmp[0 + i] - t12;
+
+ output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16);
+ output[8 + i] = (short)((a0 - a1 + 7) >> 4);
+ output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0));
+ output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b
+ }
}
}
+#if SUPPORTS_RUNTIME_INTRINSICS
+ public static void FTransformPass1SSE2(Vector128 row01, Vector128 row23, out Vector128 out01, out Vector128 out32)
+ {
+ // row01 = 00 01 10 11 02 03 12 13
+ // row23 = 20 21 30 31 22 23 32 33
+ Vector128 shuf01_p = Sse2.ShuffleHigh(row01, MmShuffle2301);
+ Vector128 shuf32_p = Sse2.ShuffleHigh(row23, MmShuffle2301);
+
+ // 00 01 10 11 03 02 13 12
+ // 20 21 30 31 23 22 33 32
+ Vector128 s01 = Sse2.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64());
+ Vector128 s32 = Sse2.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64());
+
+ // 00 01 10 11 20 21 30 31
+ // 03 02 13 12 23 22 33 32
+ Vector128 a01 = Sse2.Add(s01.AsInt16(), s32.AsInt16());
+ Vector128 a32 = Sse2.Subtract(s01.AsInt16(), s32.AsInt16());
+
+ // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
+ // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
+ Vector128 tmp0 = Sse2.MultiplyAddAdjacent(a01, K88p); // [ (a0 + a1) << 3, ... ]
+ Vector128 tmp2 = Sse2.MultiplyAddAdjacent(a01, K88m); // [ (a0 - a1) << 3, ... ]
+ Vector128 tmp11 = Sse2.MultiplyAddAdjacent(a32, K5352_2217p);
+ Vector128 tmp31 = Sse2.MultiplyAddAdjacent(a32, K5352_2217m);
+ Vector128 tmp12 = Sse2.Add(tmp11, K1812);
+ Vector128 tmp32 = Sse2.Add(tmp31, K937);
+ Vector128 tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9);
+ Vector128 tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9);
+ Vector128 s03 = Sse2.PackSignedSaturate(tmp0, tmp2);
+ Vector128 s12 = Sse2.PackSignedSaturate(tmp1, tmp3);
+ Vector128 slo = Sse2.UnpackLow(s03, s12); // 0 1 0 1 0 1...
+ Vector128 shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3
+ Vector128 v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32());
+ out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32());
+ out32 = Sse2.Shuffle(v23, MmShuffle1032);
+ }
+
+ public static void FTransformPass2SSE2(Vector128 v01, Vector128 v32, Span output)
+ {
+ // Same operations are done on the (0,3) and (1,2) pairs.
+ // a3 = v0 - v3
+ // a2 = v1 - v2
+ Vector128 a32 = Sse2.Subtract(v01.AsInt16(), v32.AsInt16());
+ Vector128 a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64());
+
+ Vector128 b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16());
+ Vector128 c1 = Sse2.MultiplyAddAdjacent(b23, K5352_2217);
+ Vector128 c3 = Sse2.MultiplyAddAdjacent(b23, K2217_5352);
+ Vector128 d1 = Sse2.Add(c1, K12000PlusOne);
+ Vector128 d3 = Sse2.Add(c3, K51000);
+ Vector128 e1 = Sse2.ShiftRightArithmetic(d1, 16);
+ Vector128 e3 = Sse2.ShiftRightArithmetic(d3, 16);
+
+ // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+ // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
+ Vector128 f1 = Sse2.PackSignedSaturate(e1, e1);
+ Vector128 f3 = Sse2.PackSignedSaturate(e3, e3);
+
+ // g1 = f1 + (a3 != 0);
+ // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+ // desired (0, 1), we add one earlier through K12000PlusOne.
+ // -> g1 = f1 + 1 - (a3 == 0)
+ Vector128 g1 = Sse2.Add(f1, Sse2.CompareEqual(a32, Vector128.Zero));
+
+ // a0 = v0 + v3
+ // a1 = v1 + v2
+ Vector128 a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16());
+ Vector128 a01Plus7 = Sse2.Add(a01.AsInt16(), Seven);
+ Vector128 a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
+ Vector128 c0 = Sse2.Add(a01Plus7, a11);
+ Vector128 c2 = Sse2.Subtract(a01Plus7, a11);
+
+ // d0 = (a0 + a1 + 7) >> 4;
+ // d2 = (a0 - a1 + 7) >> 4;
+ Vector128 d0 = Sse2.ShiftRightArithmetic(c0, 4);
+ Vector128 d2 = Sse2.ShiftRightArithmetic(c2, 4);
+
+ Vector128 d0g1 = Sse2.UnpackLow(d0.AsInt64(), g1.AsInt64());
+ Vector128 d2f3 = Sse2.UnpackLow(d2.AsInt64(), f3.AsInt64());
+
+ ref short outputRef = ref MemoryMarshal.GetReference(output);
+ Unsafe.As>(ref outputRef) = d0g1.AsInt16();
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, 8)) = d2f3.AsInt16();
+ }
+#endif
+
public static void FTransformWht(Span input, Span output, Span scratch)
{
Span tmp = scratch.Slice(0, 16);
@@ -427,32 +652,37 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
int inputIdx = 0;
for (i = 0; i < 4; i++)
{
- int a0 = input[inputIdx + (0 * 16)] + input[inputIdx + (2 * 16)]; // 13b
int a1 = input[inputIdx + (1 * 16)] + input[inputIdx + (3 * 16)];
int a2 = input[inputIdx + (1 * 16)] - input[inputIdx + (3 * 16)];
+ int a0 = input[inputIdx + (0 * 16)] + input[inputIdx + (2 * 16)]; // 13b
int a3 = input[inputIdx + (0 * 16)] - input[inputIdx + (2 * 16)];
- tmp[0 + (i * 4)] = a0 + a1; // 14b
- tmp[1 + (i * 4)] = a3 + a2;
- tmp[2 + (i * 4)] = a3 - a2;
tmp[3 + (i * 4)] = a0 - a1;
+ tmp[2 + (i * 4)] = a3 - a2;
+ tmp[1 + (i * 4)] = a3 + a2;
+ tmp[0 + (i * 4)] = a0 + a1; // 14b
inputIdx += 64;
}
for (i = 0; i < 4; i++)
{
- int a0 = tmp[0 + i] + tmp[8 + i]; // 15b
- int a1 = tmp[4 + i] + tmp[12 + i];
- int a2 = tmp[4 + i] - tmp[12 + i];
- int a3 = tmp[0 + i] - tmp[8 + i];
+ int t12 = tmp[12 + i];
+ int t8 = tmp[8 + i];
+
+ int a1 = tmp[4 + i] + t12;
+ int a2 = tmp[4 + i] - t12;
+ int a0 = tmp[0 + i] + t8; // 15b
+ int a3 = tmp[0 + i] - t8;
+
int b0 = a0 + a1; // 16b
int b1 = a3 + a2;
int b2 = a3 - a2;
int b3 = a0 - a1;
- output[0 + i] = (short)(b0 >> 1); // 15b
- output[4 + i] = (short)(b1 >> 1);
- output[8 + i] = (short)(b2 >> 1);
+
output[12 + i] = (short)(b3 >> 1);
+ output[8 + i] = (short)(b2 >> 1);
+ output[4 + i] = (short)(b1 >> 1);
+ output[0 + i] = (short)(b0 >> 1); // 15b
}
}
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
index 6e724e4758..d384302b94 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
@@ -6,7 +6,7 @@ using System.Runtime.CompilerServices;
namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
- internal class Vp8Histogram
+ internal sealed class Vp8Histogram
{
private readonly int[] scratch = new int[16];
@@ -49,7 +49,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
this.distribution.AsSpan().Clear();
for (j = startBlock; j < endBlock; j++)
{
- this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output);
+ Vp8Encoding.FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output, this.scratch);
// Convert coefficients to bin.
for (int k = 0; k < 16; ++k)
@@ -98,48 +98,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
this.lastNonZero = lastNonZero;
}
- private void Vp8FTransform(Span src, Span reference, Span output)
- {
- int i;
- Span tmp = this.scratch;
- tmp.Clear();
-
- for (i = 0; i < 4; i++)
- {
- int d0 = src[0] - reference[0]; // 9bit dynamic range ([-255,255])
- int d1 = src[1] - reference[1];
- int d2 = src[2] - reference[2];
- int d3 = src[3] - reference[3];
- int a0 = d0 + d3; // 10b [-510,510]
- int a1 = d1 + d2;
- int a2 = d1 - d2;
- int a3 = d0 - d3;
- tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160]
- tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542]
- tmp[2 + (i * 4)] = (a0 - a1) * 8;
- tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9;
-
- // Do not change the span in the last iteration.
- if (i < 3)
- {
- src = src.Slice(WebpConstants.Bps);
- reference = reference.Slice(WebpConstants.Bps);
- }
- }
-
- for (i = 0; i < 4; i++)
- {
- int a0 = tmp[0 + i] + tmp[12 + i]; // 15b
- int a1 = tmp[4 + i] + tmp[8 + i];
- int a2 = tmp[4 + i] - tmp[8 + i];
- int a3 = tmp[0 + i] - tmp[12 + i];
- output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b
- output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0));
- output[8 + i] = (short)((a0 - a1 + 7) >> 4);
- output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16);
- }
- }
-
[MethodImpl(InliningOptions.ShortMethod)]
private static int ClipMax(int v, int max) => v > max ? max : v;
}
diff --git a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
index 2f78842c63..202df9039e 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
@@ -692,16 +692,17 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
int mbw = io.MbW;
int uvw = (mbw + 1) / 2;
int y = io.MbY;
+ byte[] uvBuffer = new byte[(14 * 32) + 15];
if (y == 0)
{
// First line is special cased. We mirror the u/v samples at boundary.
- this.UpSample(curY, null, curU, curV, curU, curV, dst, null, mbw);
+ YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst, default, mbw, uvBuffer);
}
else
{
// We can finish the left-over line from previous call.
- this.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw);
+ YuvConversion.UpSample(tmpYBuffer, curY, topU, topV, curU, curV, buf.Slice(dstStartIdx - bufferStride), dst, mbw, uvBuffer);
numLinesOut++;
}
@@ -714,7 +715,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
topV = curV;
curU = curU.Slice(io.UvStride);
curV = curV.Slice(io.UvStride);
- this.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw);
+ YuvConversion.UpSample(curY.Slice(io.YStride), curY.Slice(ioStride2), topU, topV, curU, curV, dst.Slice(bufferStride), dst.Slice(bufferStride2), mbw, uvBuffer);
curY = curY.Slice(ioStride2);
dst = dst.Slice(bufferStride2);
}
@@ -736,67 +737,13 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// Process the very last row of even-sized picture.
if ((yEnd & 1) == 0)
{
- this.UpSample(curY, null, curU, curV, curU, curV, dst.Slice(bufferStride), null, mbw);
+ YuvConversion.UpSample(curY, default, curU, curV, curU, curV, dst.Slice(bufferStride), default, mbw, uvBuffer);
}
}
return numLinesOut;
}
- private void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len)
- {
- int xStep = 3;
- int lastPixelPair = (len - 1) >> 1;
- uint tluv = YuvConversion.LoadUv(topU[0], topV[0]); // top-left sample
- uint luv = YuvConversion.LoadUv(curU[0], curV[0]); // left-sample
- uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
- YuvConversion.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst);
-
- if (bottomY != null)
- {
- uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
- YuvConversion.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst);
- }
-
- for (int x = 1; x <= lastPixelPair; x++)
- {
- uint tuv = YuvConversion.LoadUv(topU[x], topV[x]); // top sample
- uint uv = YuvConversion.LoadUv(curU[x], curV[x]); // sample
-
- // Precompute invariant values associated with first and second diagonals.
- uint avg = tluv + tuv + luv + uv + 0x00080008u;
- uint diag12 = (avg + (2 * (tuv + luv))) >> 3;
- uint diag03 = (avg + (2 * (tluv + uv))) >> 3;
- uv0 = (diag12 + tluv) >> 1;
- uint uv1 = (diag03 + tuv) >> 1;
- int xMul2 = x * 2;
- YuvConversion.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep));
- YuvConversion.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep));
-
- if (bottomY != null)
- {
- uv0 = (diag03 + luv) >> 1;
- uv1 = (diag12 + uv) >> 1;
- YuvConversion.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep));
- YuvConversion.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep));
- }
-
- tluv = tuv;
- luv = uv;
- }
-
- if ((len & 1) == 0)
- {
- uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
- YuvConversion.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep));
- if (bottomY != null)
- {
- uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
- YuvConversion.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep));
- }
- }
- }
-
private void DoTransform(uint bits, Span src, Span dst, Span scratch)
{
switch (bits >> 30)
diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
index a9cf876c80..16d458ed88 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@@ -4,6 +4,11 @@
using System;
using System.Buffers;
using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
@@ -18,6 +23,291 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
private const int YuvHalf = 1 << (YuvFix - 1);
+#if SUPPORTS_RUNTIME_INTRINSICS
+ private static readonly Vector128 One = Vector128.Create((byte)1);
+
+ // These constants are the 14-bit fixed-point versions of the ITU-R BT.601 constants.
+ // R = (19077 * y + 26149 * v - 14234) >> 6
+ // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
+ // B = (19077 * y + 33050 * u - 17685) >> 6
+ private static readonly Vector128 K19077 = Vector128.Create((short)19077).AsByte();
+
+ private static readonly Vector128 K26149 = Vector128.Create((short)26149).AsByte();
+
+ private static readonly Vector128 K14234 = Vector128.Create((short)14234).AsByte();
+
+ // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
+ private static readonly Vector128 K33050 = Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129);
+
+ private static readonly Vector128 K17685 = Vector128.Create((short)17685).AsByte();
+
+ private static readonly Vector128 K6419 = Vector128.Create((short)6419).AsByte();
+
+ private static readonly Vector128 K13320 = Vector128.Create((short)13320).AsByte();
+
+ private static readonly Vector128 K8708 = Vector128.Create((short)8708).AsByte();
+
+ private static readonly Vector128 PlanarTo24Shuffle0 = Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5);
+
+ private static readonly Vector128 PlanarTo24Shuffle1 = Vector128.Create(255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10, 255);
+
+ private static readonly Vector128 PlanarTo24Shuffle2 = Vector128.Create(255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255, 255);
+
+ private static readonly Vector128 PlanarTo24Shuffle3 = Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255);
+
+ private static readonly Vector128 PlanarTo24Shuffle4 = Vector128.Create(5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10);
+
+ private static readonly Vector128 PlanarTo24Shuffle5 = Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255);
+
+ private static readonly Vector128 PlanarTo24Shuffle6 = Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255);
+
+ private static readonly Vector128 PlanarTo24Shuffle7 = Vector128.Create(255, 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255);
+
+ private static readonly Vector128 PlanarTo24Shuffle8 = Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15);
+#endif
+
+ // UpSample from YUV to RGB.
+ // Given samples laid out in a square as:
+ // [a b]
+ // [c d]
+ // we interpolate u/v as:
+ // ([9*a + 3*b + 3*c + d, 3*a + 9*b + c + 3*d] + [8, 8]) / 16
+ // ([3*a + b + 9*c + 3*d, a + 3*b + 3*c + 9*d] + [8, 8]) / 16
+ public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer)
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Sse41.IsSupported)
+ {
+ UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
+ }
+ else
+#endif
+ {
+ UpSampleScalar(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len);
+ }
+ }
+
+ private static void UpSampleScalar(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len)
+ {
+ int xStep = 3;
+ int lastPixelPair = (len - 1) >> 1;
+ uint tluv = LoadUv(topU[0], topV[0]); // top-left sample
+ uint luv = LoadUv(curU[0], curV[0]); // left-sample
+ uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
+ YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst);
+
+ if (bottomY != default)
+ {
+ uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
+ YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst);
+ }
+
+ for (int x = 1; x <= lastPixelPair; x++)
+ {
+ uint tuv = LoadUv(topU[x], topV[x]); // top sample
+ uint uv = LoadUv(curU[x], curV[x]); // sample
+
+ // Precompute invariant values associated with first and second diagonals.
+ uint avg = tluv + tuv + luv + uv + 0x00080008u;
+ uint diag12 = (avg + (2 * (tuv + luv))) >> 3;
+ uint diag03 = (avg + (2 * (tluv + uv))) >> 3;
+ uv0 = (diag12 + tluv) >> 1;
+ uint uv1 = (diag03 + tuv) >> 1;
+ int xMul2 = x * 2;
+ YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep));
+ YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep));
+
+ if (bottomY != default)
+ {
+ uv0 = (diag03 + luv) >> 1;
+ uv1 = (diag12 + uv) >> 1;
+ YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep));
+ YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep));
+ }
+
+ tluv = tuv;
+ luv = uv;
+ }
+
+ if ((len & 1) == 0)
+ {
+ uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
+ YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep));
+ if (bottomY != default)
+ {
+ uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
+ YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep));
+ }
+ }
+ }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ // We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
+ // u = (9*a + 3*b + 3*c + d + 8) / 16
+ // = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
+ // = (a + m + 1) / 2
+ // where m = (a + 3*b + 3*c + d) / 8
+ // = ((a + b + c + d) / 2 + b + c) / 4
+ //
+ // Let's say k = (a + b + c + d) / 4.
+ // We can compute k as
+ // k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
+ // where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
+ //
+ // Then m can be written as
+ // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
+ private static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer)
+ {
+ const int xStep = 3;
+ Array.Clear(uvBuffer, 0, uvBuffer.Length);
+ Span ru = uvBuffer.AsSpan(15);
+ Span rv = ru.Slice(32);
+
+ // Treat the first pixel in the regular (scalar) way.
+ int uDiag = ((topU[0] + curU[0]) >> 1) + 1;
+ int vDiag = ((topV[0] + curV[0]) >> 1) + 1;
+ int u0t = (topU[0] + uDiag) >> 1;
+ int v0t = (topV[0] + vDiag) >> 1;
+ YuvToBgr(topY[0], u0t, v0t, topDst);
+ if (bottomY != default)
+ {
+ int u0b = (curU[0] + uDiag) >> 1;
+ int v0b = (curV[0] + vDiag) >> 1;
+ YuvToBgr(bottomY[0], u0b, v0b, bottomDst);
+ }
+
+ // For UpSample32Pixels, 17 u/v values must be readable for each block.
+ int pos;
+ int uvPos;
+ ref byte topURef = ref MemoryMarshal.GetReference(topU);
+ ref byte topVRef = ref MemoryMarshal.GetReference(topV);
+ ref byte curURef = ref MemoryMarshal.GetReference(curU);
+ ref byte curVRef = ref MemoryMarshal.GetReference(curV);
+ if (bottomY != null)
+ {
+ for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
+ {
+ UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, uvPos), ru);
+ UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv);
+ ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
+ }
+ }
+ else
+ {
+ for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
+ {
+ UpSample32Pixels(ref Unsafe.Add(ref topURef, uvPos), ref Unsafe.Add(ref curURef, uvPos), ru);
+ UpSample32Pixels(ref Unsafe.Add(ref topVRef, uvPos), ref Unsafe.Add(ref curVRef, uvPos), rv);
+ ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep);
+ }
+ }
+
+ // Process last block.
+ if (len > 1)
+ {
+ int leftOver = ((len + 1) >> 1) - (pos >> 1);
+ Span tmpTopDst = ru.Slice(4 * 32);
+ Span tmpBottomDst = tmpTopDst.Slice(4 * 32);
+ Span tmpTop = tmpBottomDst.Slice(4 * 32);
+ Span tmpBottom = (bottomY == null) ? null : tmpTop.Slice(32);
+ UpSampleLastBlock(topU.Slice(uvPos), curU.Slice(uvPos), leftOver, ru);
+ UpSampleLastBlock(topV.Slice(uvPos), curV.Slice(uvPos), leftOver, rv);
+
+ topY.Slice(pos, len - pos).CopyTo(tmpTop);
+ if (bottomY != default)
+ {
+ bottomY.Slice(pos, len - pos).CopyTo(tmpBottom);
+ ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
+ }
+ else
+ {
+ ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep);
+ }
+
+ tmpTopDst.Slice(0, (len - pos) * xStep).CopyTo(topDst.Slice(pos * xStep));
+ if (bottomY != default)
+ {
+ tmpBottomDst.Slice(0, (len - pos) * xStep).CopyTo(bottomDst.Slice(pos * xStep));
+ }
+ }
+ }
+
+ // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
+ private static void UpSample32Pixels(ref byte r1, ref byte r2, Span