Reimplement @Sergio0694 work.

2 years ago · cd1b77a88f
8 changed files with 257 additions and 111 deletions
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@ -1097,4 +1097,51 @@ internal static class Numerics
    public static nuint Vector512Count<TVector>(int length)
        where TVector : struct
        => (uint)length / (uint)Vector512<TVector>.Count;
+
+    /// <summary>
+    /// Normalizes the values in a given <see cref="Span{T}"/>.
+    /// </summary>
+    /// <param name="span">The sequence of <see cref="float"/> values to normalize.</param>
+    /// <param name="sum">The sum of the values in <paramref name="span"/>.</param>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static void Normalize(Span<float> span, float sum)
+    {
+        if (Vector256.IsHardwareAccelerated)
+        {
+            ref float startRef = ref MemoryMarshal.GetReference(span);
+            ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~7);
+            Vector256<float> sum256 = Vector256.Create(sum);
+
+            while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
+            {
+                Unsafe.As<float, Vector256<float>>(ref startRef) /= sum256;
+                startRef = ref Unsafe.Add(ref startRef, (nuint)8);
+            }
+
+            if ((span.Length & 7) >= 4)
+            {
+                Unsafe.As<float, Vector128<float>>(ref startRef) /= sum256.GetLower();
+                startRef = ref Unsafe.Add(ref startRef, (nuint)4);
+            }
+
+            endRef = ref Unsafe.Add(ref startRef, span.Length & 3);
+
+            while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
+            {
+                startRef /= sum;
+                startRef = ref Unsafe.Add(ref startRef, (nuint)1);
+            }
+        }
+        else
+        {
+            ref float startRef = ref MemoryMarshal.GetReference(span);
+            ref float endRef = ref Unsafe.Add(ref startRef, span.Length);
+
+            while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
+            {
+                startRef /= sum;
+                startRef = ref Unsafe.Add(ref startRef, (nuint)1);
+            }
+        }
+    }
 }
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@ -245,6 +245,44 @@ internal static class Vector128Utilities
        return default;
    }

+    /// <summary>
+    /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
+    /// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
+    /// corresponding element in <paramref name="c"/>.
+    /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
+    /// fused operation for better performance and precision.
+    /// </summary>
+    /// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
+    /// <paramref name="a"/> and <paramref name="b"/>.</param>
+    /// <returns>
+    /// A <see cref="Vector128{Single}"/> where each element is the result of multiplying the corresponding elements
+    /// of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element from <paramref name="c"/>.
+    /// </returns>
+    /// <remarks>
+    /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using
+    /// <see cref="Fma.MultiplyAdd(Vector128{float}, Vector128{float}, Vector128{float})"/>. This approach can result
+    /// in slightly different results compared to performing the multiplication and addition separately due to
+    /// differences in how floating-point
+    /// rounding is handled.
+    /// <para>
+    /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead
+    /// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy
+    /// is critical.
+    /// </para>
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<float> MultiplyAdd(Vector128<float> a, Vector128<float> b, Vector128<float> c)
+    {
+        if (Fma.IsSupported)
+        {
+            return Fma.MultiplyAdd(a, b, c);
+        }
+
+        return (a * b) + c;
+    }
+
    [DoesNotReturn]
    private static void ThrowUnreachableException() => throw new UnreachableException();
 }
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@ -110,6 +110,44 @@ internal static class Vector256Utilities
        return Vector256.ConvertToInt32(val_2p23_f32 | sign);
    }

+    /// <summary>
+    /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
+    /// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
+    /// corresponding element in <paramref name="c"/>.
+    /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
+    /// fused operation for better performance and precision.
+    /// </summary>
+    /// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
+    /// <paramref name="a"/> and <paramref name="b"/>.</param>
+    /// <returns>
+    /// A <see cref="Vector256{Single}"/> where each element is the result of multiplying the corresponding elements
+    /// of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element from <paramref name="c"/>.
+    /// </returns>
+    /// <remarks>
+    /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using
+    /// <see cref="Fma.MultiplyAdd(Vector256{float}, Vector256{float}, Vector256{float})"/>. This approach can result
+    /// in slightly different results compared to performing the multiplication and addition separately due to
+    /// differences in how floating-point
+    /// rounding is handled.
+    /// <para>
+    /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead
+    /// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy
+    /// is critical.
+    /// </para>
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<float> MultiplyAdd(Vector256<float> a, Vector256<float> b, Vector256<float> c)
+    {
+        if (Fma.IsSupported)
+        {
+            return Fma.MultiplyAdd(a, b, c);
+        }
+
+        return (a * b) + c;
+    }
+
    [DoesNotReturn]
    private static void ThrowUnreachableException() => throw new UnreachableException();
 }
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@ -5,7 +5,7 @@ using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;

 namespace SixLabors.ImageSharp.Processing.Processors.Transforms;

@ -14,6 +14,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
 /// </summary>
 internal readonly unsafe struct ResizeKernel
 {
+    /// <summary>
+    /// The buffer with the convolution factors.
+    /// Note that when FMA is supported, this is of size 4x that reported in <see cref="Length"/>.
+    /// </summary>
    private readonly float* bufferPtr;

    /// <summary>
@ -53,7 +57,15 @@ internal readonly unsafe struct ResizeKernel
    public Span<float> Values
    {
        [MethodImpl(InliningOptions.ShortMethod)]
-        get => new(this.bufferPtr, this.Length);
+        get
+        {
+            if (Vector256.IsHardwareAccelerated)
+            {
+                return new(this.bufferPtr, this.Length * 4);
+            }
+
+            return new(this.bufferPtr, this.Length);
+        }
    }

    /// <summary>
@ -68,70 +80,42 @@ internal readonly unsafe struct ResizeKernel
    [MethodImpl(InliningOptions.ShortMethod)]
    public Vector4 ConvolveCore(ref Vector4 rowStartRef)
    {
-        if (Avx2.IsSupported && Fma.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
        {
            float* bufferStart = this.bufferPtr;
-            float* bufferEnd = bufferStart + (this.Length & ~3);
+            ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3);
            Vector256<float> result256_0 = Vector256<float>.Zero;
            Vector256<float> result256_1 = Vector256<float>.Zero;
-            ReadOnlySpan<byte> maskBytes = new byte[]
-            {
-                0, 0, 0, 0, 0, 0, 0, 0,
-                0, 0, 0, 0, 0, 0, 0, 0,
-                1, 0, 0, 0, 1, 0, 0, 0,
-                1, 0, 0, 0, 1, 0, 0, 0,
-            };
-            Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));

-            while (bufferStart < bufferEnd)
+            while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef))
            {
-                // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
-                // for the FMA operation, and execute it directly on the target register and reading directly from
-                // memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
-                // The code below should compile in the following assembly on .NET 5 x64:
-                //
-                // vmovsd xmm2, [rax]               ; load *(double*)bufferStart into xmm2 as [ab, _]
-                // vpermps ymm2, ymm1, ymm2         ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
-                // vfmadd231ps ymm0, ymm2, [r8]     ; result256_0 = FMA(pixels, factors) + result256_0
-                //
-                // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
-                // Additionally, we're also unrolling two computations per each loop iterations to leverage the
-                // fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
-                result256_0 = Fma.MultiplyAdd(
-                    Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
-                    Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
-                    result256_0);
-
-                result256_1 = Fma.MultiplyAdd(
-                    Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
-                    Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
-                    result256_1);
-
-                bufferStart += 4;
-                rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
+                Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
+                Vector256<float> pixels256_1 = Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)2));
+
+                result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0);
+                result256_1 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart + 8), pixels256_1, result256_1);
+
+                bufferStart += 16;
+                rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
            }

-            result256_0 = Avx.Add(result256_0, result256_1);
+            result256_0 += result256_1;

            if ((this.Length & 3) >= 2)
            {
-                result256_0 = Fma.MultiplyAdd(
-                    Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
-                    Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
-                    result256_0);
+                Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
+                result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0);

-                bufferStart += 2;
-                rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
+                bufferStart += 8;
+                rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
            }

-            Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());
+            Vector128<float> result128 = result256_0.GetLower() + result256_0.GetUpper();

            if ((this.Length & 1) != 0)
            {
-                result128 = Fma.MultiplyAdd(
-                    Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
-                    Vector128.Create(*bufferStart),
-                    result128);
+                Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
+                result128 = Vector128Utilities.MultiplyAdd(Vector128.Load(bufferStart), pixels128, result128);
            }

            return *(Vector4*)&result128;
@ -149,7 +133,7 @@ internal readonly unsafe struct ResizeKernel
                result += rowStartRef * *bufferStart;

                bufferStart++;
-                rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
+                rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)1);
            }

            return result;
@ -164,13 +148,30 @@ internal readonly unsafe struct ResizeKernel
    internal ResizeKernel AlterLeftValue(int left)
        => new(left, this.bufferPtr, this.Length);

-    internal void Fill(Span<double> values)
+    internal void FillOrCopyAndExpand(Span<float> values)
    {
        DebugGuard.IsTrue(values.Length == this.Length, nameof(values), "ResizeKernel.Fill: values.Length != this.Length!");

-        for (int i = 0; i < this.Length; i++)
+        if (Vector256.IsHardwareAccelerated)
        {
-            this.Values[i] = (float)values[i];
+            Vector4* bufferStart = (Vector4*)this.bufferPtr;
+            ref float valuesStart = ref MemoryMarshal.GetReference(values);
+            ref float valuesEnd = ref Unsafe.Add(ref valuesStart, values.Length);
+
+            while (Unsafe.IsAddressLessThan(ref valuesStart, ref valuesEnd))
+            {
+                *bufferStart = new Vector4(valuesStart);
+
+                bufferStart++;
+                valuesStart = ref Unsafe.Add(ref valuesStart, (nuint)1);
+            }
+        }
+        else
+        {
+            for (int i = 0; i < this.Length; i++)
+            {
+                this.Values[i] = (float)values[i];
+            }
        }
    }
 }
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs
@ -54,7 +54,7 @@ internal partial class ResizeKernelMap
            int bottomStartDest = this.DestinationLength - this.cornerInterval;
            for (int i = startOfFirstRepeatedMosaic; i < bottomStartDest; i++)
            {
-                double center = ((i + .5) * this.ratio) - .5;
+                float center = (float)(((i + .5) * this.ratio) - .5);
                int left = (int)TolerantMath.Ceiling(center - this.radius);
                ResizeKernel kernel = this.kernels[i - this.period];
                this.kernels[i] = kernel.AlterLeftValue(left);
--- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
+++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
@ -5,6 +5,7 @@ using System.Buffers;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
 using SixLabors.ImageSharp.Memory;

 namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
@ -33,7 +34,7 @@ internal partial class ResizeKernelMap : IDisposable
    private bool isDisposed;

    // To avoid both GC allocations, and MemoryAllocator ceremony:
-    private readonly double[] tempValues;
+    private readonly float[] tempValues;

    private ResizeKernelMap(
        MemoryAllocator memoryAllocator,
@ -50,10 +51,19 @@ internal partial class ResizeKernelMap : IDisposable
        this.sourceLength = sourceLength;
        this.DestinationLength = destinationLength;
        this.MaxDiameter = (radius * 2) + 1;
-        this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true, AllocationOptions.Clean);
+
+        if (Vector256.IsHardwareAccelerated)
+        {
+            this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter * 4, bufferHeight, preferContiguosImageBuffers: true);
+        }
+        else
+        {
+            this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true);
+        }
+
        this.pinHandle = this.data.DangerousGetSingleMemory().Pin();
        this.kernels = new ResizeKernel[destinationLength];
-        this.tempValues = new double[this.MaxDiameter];
+        this.tempValues = new float[this.MaxDiameter];
    }

    /// <summary>
@ -155,23 +165,23 @@ internal partial class ResizeKernelMap : IDisposable
        bool hasAtLeast2Periods = 2 * (cornerInterval + period) < destinationSize;

        ResizeKernelMap result = hasAtLeast2Periods
-                                     ? new PeriodicKernelMap(
-                                         memoryAllocator,
-                                         sourceSize,
-                                         destinationSize,
-                                         ratio,
-                                         scale,
-                                         radius,
-                                         period,
-                                         cornerInterval)
-                                     : new ResizeKernelMap(
-                                         memoryAllocator,
-                                         sourceSize,
-                                         destinationSize,
-                                         destinationSize,
-                                         ratio,
-                                         scale,
-                                         radius);
+        ? new PeriodicKernelMap(
+            memoryAllocator,
+            sourceSize,
+            destinationSize,
+            ratio,
+            scale,
+            radius,
+            period,
+            cornerInterval)
+        : new ResizeKernelMap(
+            memoryAllocator,
+            sourceSize,
+            destinationSize,
+            destinationSize,
+            ratio,
+            scale,
+            radius);

        result.Initialize(in sampler);

@ -198,7 +208,8 @@ internal partial class ResizeKernelMap : IDisposable
    private ResizeKernel BuildKernel<TResampler>(in TResampler sampler, int destRowIndex, int dataRowIndex)
        where TResampler : struct, IResampler
    {
-        double center = ((destRowIndex + .5) * this.ratio) - .5;
+        float center = (float)(((destRowIndex + .5) * this.ratio) - .5);
+        float scale = (float)this.scale;

        // Keep inside bounds.
        int left = (int)TolerantMath.Ceiling(center - this.radius);
@ -214,30 +225,25 @@ internal partial class ResizeKernelMap : IDisposable
        }

        ResizeKernel kernel = this.CreateKernel(dataRowIndex, left, right);
-
-        Span<double> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
-        double sum = 0;
+        Span<float> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
+        ref float kernelStart = ref MemoryMarshal.GetReference(kernelValues);
+        float sum = 0;

        for (int j = left; j <= right; j++)
        {
-            double value = sampler.GetValue((float)((j - center) / this.scale));
+            float value = sampler.GetValue((j - center) / scale);
            sum += value;
-
-            kernelValues[j - left] = value;
+            kernelStart = value;
+            kernelStart = ref Unsafe.Add(ref kernelStart, 1);
        }

        // Normalize, best to do it here rather than in the pixel loop later on.
        if (sum > 0)
        {
-            for (int j = 0; j < kernel.Length; j++)
-            {
-                // weights[w] = weights[w] / sum:
-                ref double kRef = ref kernelValues[j];
-                kRef /= sum;
-            }
+            Numerics.Normalize(kernelValues, sum);
        }

-        kernel.Fill(kernelValues);
+        kernel.FillOrCopyAndExpand(kernelValues);

        return kernel;
    }
--- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs
@ -16,9 +16,7 @@ public partial class ResizeKernelMapTests
        private readonly ReferenceKernel[] kernels;

        public ReferenceKernelMap(ReferenceKernel[] kernels)
-        {
-            this.kernels = kernels;
-        }
+            => this.kernels = kernels;

        public int DestinationSize => this.kernels.Length;

@ -28,22 +26,23 @@ public partial class ResizeKernelMapTests
            where TResampler : struct, IResampler
        {
            double ratio = (double)sourceSize / destinationSize;
-            double scale = ratio;
+            double scaleD = ratio;

-            if (scale < 1F)
+            if (scaleD < 1)
            {
-                scale = 1F;
+                scaleD = 1;
            }

            TolerantMath tolerantMath = TolerantMath.Default;

-            double radius = tolerantMath.Ceiling(scale * sampler.Radius);
+            double radius = tolerantMath.Ceiling(scaleD * sampler.Radius);

-            var result = new List<ReferenceKernel>();
+            List<ReferenceKernel> result = [];

+            float scale = (float)scaleD;
            for (int i = 0; i < destinationSize; i++)
            {
-                double center = ((i + .5) * ratio) - .5;
+                float center = (float)(((i + .5) * ratio) - .5);

                // Keep inside bounds.
                int left = (int)tolerantMath.Ceiling(center - radius);
@ -58,15 +57,14 @@ public partial class ResizeKernelMapTests
                    right = sourceSize - 1;
                }

-                double sum = 0;
+                float sum = 0;

-                double[] values = new double[right - left + 1];
+                float[] values = new float[right - left + 1];

                for (int j = left; j <= right; j++)
                {
-                    double weight = sampler.GetValue((float)((j - center) / scale));
+                    float weight = sampler.GetValue((j - center) / scale);
                    sum += weight;
-
                    values[j - left] = weight;
                }

@ -78,16 +76,14 @@ public partial class ResizeKernelMapTests
                    }
                }

-                float[] floatVals = values.Select(v => (float)v).ToArray();
-
-                result.Add(new ReferenceKernel(left, floatVals));
+                result.Add(new ReferenceKernel(left, values));
            }

-            return new ReferenceKernelMap(result.ToArray());
+            return new ReferenceKernelMap([.. result]);
        }
    }

-    internal struct ReferenceKernel
+    internal readonly struct ReferenceKernel
    {
        public ReferenceKernel(int left, float[] values)
        {
@ -102,8 +98,6 @@ public partial class ResizeKernelMapTests
        public int Length => this.Values.Length;

        public static implicit operator ReferenceKernel(ResizeKernel orig)
-        {
-            return new ReferenceKernel(orig.StartIndex, orig.Values.ToArray());
-        }
+            => new(orig.StartIndex, orig.Values.ToArray());
    }
 }
--- a/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs
@ -1,6 +1,7 @@
 // Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.

+using System.Runtime.Intrinsics;
 using System.Text;
 using SixLabors.ImageSharp.Processing;
 using SixLabors.ImageSharp.Processing.Processors.Transforms;
@ -124,7 +125,6 @@ public partial class ResizeKernelMapTests
        this.Output.WriteLine($"Expected KernelMap:\n{PrintKernelMap(referenceMap)}\n");
        this.Output.WriteLine($"Actual KernelMap:\n{PrintKernelMap(kernelMap)}\n");
 #endif
-        var comparer = new ApproximateFloatComparer(1e-6f);

        for (int i = 0; i < kernelMap.DestinationLength; i++)
        {
@ -139,7 +139,29 @@ public partial class ResizeKernelMapTests
                referenceKernel.Left == kernel.StartIndex,
                $"referenceKernel.Left != kernel.Left: {referenceKernel.Left} != {kernel.StartIndex}");
            float[] expectedValues = referenceKernel.Values;
-            Span<float> actualValues = kernel.Values;
+            Span<float> actualValues;
+
+            ApproximateFloatComparer comparer;
+            if (Vector256.IsHardwareAccelerated)
+            {
+                comparer = new ApproximateFloatComparer(1e-4f);
+
+                Assert.Equal(expectedValues.Length, kernel.Values.Length / 4);
+
+                int actualLength = referenceKernel.Length / 4;
+
+                actualValues = new float[expectedValues.Length];
+
+                for (int j = 0; j < expectedValues.Length; j++)
+                {
+                    actualValues[j] = kernel.Values[j * 4];
+                }
+            }
+            else
+            {
+                comparer = new ApproximateFloatComparer(1e-6f);
+                actualValues = kernel.Values;
+            }

            Assert.Equal(expectedValues.Length, actualValues.Length);