Optimized lvi calculation via lzcnt intrinsic

5 years ago · 937a8689ba
1 changed files with 18 additions and 20 deletions
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@ -3,6 +3,7 @@

 using System;
 using System.IO;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
@ -441,7 +442,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
            // Lzcnt would return 32 for input value of 0 - no need to check that with branching
            // Fallback code if Lzcnt is not supported still use if-check
            // But most modern CPUs support this instruction so this should not be a problem
-            return 32 - System.Numerics.BitOperations.LeadingZeroCount(value);
+            return 32 - BitOperations.LeadingZeroCount(value);
 #else
            // Ideally:
            // if 0 - return 0 in this case
@ -458,13 +459,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        }

        /// <summary>
-        /// Returns index of the last non-zero element in given mcu block.
-        /// If all values of the mcu block are zero, this method might return different results depending on the runtime and hardware support.
-        /// This is jpeg mcu specific code, mcu[0] stores a dc value which will be encoded outside of the loop.
-        /// This method is guaranteed to return either -1 or 0 if all elements are zero.
+        /// Returns index of the last non-zero element in given matrix.
        /// </summary>
        /// <remarks>
-        /// This is an internal operation supposed to be used only in <see cref="HuffmanScanEncoder"/> class for jpeg encoding.
+        /// Returns 0 for all-zero matrix by convention.
        /// </remarks>
        /// <param name="mcu">Mcu block.</param>
        /// <returns>Index of the last non-zero element.</returns>
@ -484,24 +482,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                {
                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref mcuStride, i)), zero8).AsByte());

-                    // we do not know for sure if this stride contain all non-zero elements or if it has some trailing zeros
                    if (areEqual != equalityMask)
                    {
-                        // last index in the stride, we go from the end to the start of the stride
-                        int startIndex = i * 8;
-                        int index = startIndex + 7;
-                        ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref mcu);
-                        while (index >= startIndex && (int)Unsafe.Add(ref elemRef, index) == 0)
-                        {
-                            index--;
-                        }
-
-                        // this implementation will return -1 if all ac components are zero and dc are zero
-                        return index;
+                        // Each 4 bits represents comparison operation for each 4-byte element in input vectors
+                        // LSB represents first element in the stride
+                        // MSB represents last element in the stride
+                        // lzcnt operation would calculate number of zero numbers at the end
+
+                        // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements
+                        // So we need to invert it
+                        int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual);
+
+                        // As input number is represented by 4 bits in the mask, we need to divide lzcnt result by 4
+                        // to get the exact number of zero elements in the stride
+                        int strideRelativeIndex = 7 - (lzcnt / 4);
+                        return (i * 8) + strideRelativeIndex;
                    }
                }

-                return -1;
+                return 0;
            }
            else
 #endif
@ -514,7 +513,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                    index--;
                }

-                // this implementation will return 0 if all ac components and dc are zero
                return index;
            }
        }