diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index b2bedb87b4..f09530d6b2 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -547,5 +547,131 @@ namespace SixLabors.ImageSharp
}
}
}
+
+ ///
+ /// Calculates the cube pow of all the XYZ channels of the input vectors.
+ ///
+ /// The span of vectors
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static unsafe void CubePowOnXYZ(Span vectors)
+ {
+ ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+ int length = vectors.Length;
+
+ for (int x = 0; x < length; x++)
+ {
+ ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
+ Vector4 v = pixel4;
+ float a = v.W;
+
+ // Fast path for the default gamma exposure, which is 3. In this case we can skip
+ // calling Math.Pow 3 times (one per component), as the method is an internal call and
+ // introduces quite a bit of overhead. Instead, we can just manually multiply the whole
+ // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
+ // back to the target index in the temporary span. The whole iteration will get completely
+ // inlined and traslated into vectorized instructions, with much better performance.
+ v = v * v * v;
+ v.W = a;
+
+ pixel4 = v;
+ }
+ }
+
+ ///
+ /// Calculates the cube root of all the XYZ channels of the input vectors.
+ ///
+ /// The span of vectors
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static unsafe void CubeRootOnXYZ(Span vectors)
+ {
+ ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
+ int length = vectors.Length;
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Sse41.IsSupported)
+ {
+ var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF);
+ var v128_0x3F8000000 = Vector128.Create(0x3F800000);
+ var v128_341 = Vector128.Create(341);
+ var v128_0x80000000 = Vector128.Create(unchecked((int)0x80000000));
+ var v4_23rds = new Vector4(2 / 3f);
+ var v4_13rds = new Vector4(1 / 3f);
+
+ for (int x = 0; x < length; x++)
+ {
+ ref Vector4 v4 = ref Unsafe.Add(ref vectorsRef, x);
+
+ Vector4 vx = v4;
+ float a = vx.W;
+ Vector128 veax = Unsafe.As>(ref vx);
+ Vector128 vecx = veax;
+
+ // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
+ // directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
+ // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
+ // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
+ // using a different succession (the same used below), which should be less unstable due to not having cube pow.
+ veax = Sse2.And(veax, v128_0x7FFFFFFF);
+ veax = Sse2.Subtract(veax, v128_0x3F8000000);
+ veax = Sse2.ShiftRightArithmetic(veax, 10);
+ veax = Sse41.MultiplyLow(veax, v128_341);
+ veax = Sse2.Add(veax, v128_0x3F8000000);
+ veax = Sse2.And(veax, v128_0x7FFFFFFF);
+ vecx = Sse2.And(vecx, v128_0x80000000);
+ veax = Sse2.Or(veax, vecx);
+
+ Vector4 y4 = *(Vector4*)&veax;
+
+ y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4)));
+ y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4)));
+ y4.W = a;
+
+ v4 = y4;
+ }
+
+ return;
+ }
+#else
+ for (int x = 0; x < length; x++)
+ {
+ ref Vector4 v = ref Unsafe.Add(ref vectorsRef, x);
+
+ double
+ x64 = v.X,
+ y64 = v.Y,
+ z64 = v.Z;
+ float a = v.W;
+
+ ulong
+ xl = *(ulong*)&x64,
+ yl = *(ulong*)&y64,
+ zl = *(ulong*)&z64;
+
+ // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
+ // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case,
+ // this means what we actually want is to find the cube root of our clamped values.
+ // For more info on the constant below, see:
+ // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
+ // Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and
+ // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
+ // register, and use it to accelerate two steps of the Newton approximation using SIMD.
+ xl = 0x2a9f8a7be393b600 + (xl / 3);
+ yl = 0x2a9f8a7be393b600 + (yl / 3);
+ zl = 0x2a9f8a7be393b600 + (zl / 3);
+
+ Vector4 y4;
+ y4.X = (float)*(double*)&xl;
+ y4.Y = (float)*(double*)&yl;
+ y4.Z = (float)*(double*)&zl;
+ y4.W = 0;
+
+ y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+ y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+ y4.W = a;
+
+ v = y4;
+ }
+#endif
+ }
}
}
diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
index 02308d3fb1..a21155e10c 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@@ -331,27 +331,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
public void Invoke(int y, Span span)
{
Span targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
+
PixelOperations.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
- ref Vector4 baseRef = ref MemoryMarshal.GetReference(span);
- int length = this.bounds.Width;
- for (int x = 0; x < length; x++)
- {
- ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
- Vector4 v = pixel4;
- float a = v.W;
-
- // Fast path for the default gamma exposure, which is 3. In this case we can skip
- // calling Math.Pow 3 times (one per component), as the method is an internal call and
- // introduces quite a bit of overhead. Instead, we can just manually multiply the whole
- // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
- // back to the target index in the temporary span. The whole iteration will get completely
- // inlined and traslated into vectorized instructions, with much better performance.
- v = v * v * v;
- v.W = a;
-
- pixel4 = v;
- }
+ Numerics.CubePowOnXYZ(span);
PixelOperations.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
}
@@ -438,47 +421,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
Numerics.Clamp(MemoryMarshal.Cast(sourceRowSpan), 0, float.PositiveInfinity);
+ Numerics.CubeRootOnXYZ(sourceRowSpan);
Span targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
- int length = this.bounds.Width;
-
- for (int x = 0; x < length; x++)
- {
- ref Vector4 v = ref Unsafe.Add(ref sourceRef, x);
-
- double
- x64 = v.X,
- y64 = v.Y,
- z64 = v.Z;
- float a = v.W;
-
- ulong
- xl = *(ulong*)&x64,
- yl = *(ulong*)&y64,
- zl = *(ulong*)&z64;
-
- // Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root
- // of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the
- // constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform
- // the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set
- // these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD.
- // As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values.
- xl = 0x2a9f8a7be393b600 + (xl / 3);
- yl = 0x2a9f8a7be393b600 + (yl / 3);
- zl = 0x2a9f8a7be393b600 + (zl / 3);
-
- Vector4 y4;
- y4.X = (float)*(double*)&xl;
- y4.Y = (float)*(double*)&yl;
- y4.Z = (float)*(double*)&zl;
- y4.W = 0;
-
- y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
- y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
- y4.W = a;
-
- v = y4;
- }
PixelOperations.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
}