Merge branch 'master' into af/UniformUnmanagedMemoryPoolMemoryAllocator-02

# Conflicts:
#	src/ImageSharp/Memory/Allocators/IManagedByteBuffer.cs
#	src/ImageSharp/Memory/Allocators/Internals/BasicByteBuffer.cs
#	src/ImageSharp/Memory/Allocators/Internals/ManagedBufferBase.cs
#	tests/ImageSharp.Tests.ProfilingSandbox/LoadResizeSaveParallelMemoryStress.cs
#	tests/ImageSharp.Tests/Formats/Tiff/Compression/PackBitsTiffCompressionTests.cs
#	tests/ImageSharp.Tests/Image/ImageFrameTests.cs
#	tests/ImageSharp.Tests/Image/ImageTests.cs
#	tests/ImageSharp.Tests/Memory/Allocators/ArrayPoolMemoryAllocatorTests.cs
#	tests/ImageSharp.Tests/Memory/Allocators/SimpleGcMemoryAllocatorTests.cs
#	tests/ImageSharp.Tests/TestUtilities/ImageProviders/TestImageProvider.cs
#	tests/ImageSharp.Tests/TestUtilities/TestImageExtensions.cs
4 years ago · 017ee4049c
599 changed files with 7026 additions and 3748 deletions
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@ -1,19 +1,37 @@
 name: Build

 on:
-    push:
-        branches:
-            - master
-        tags:
-            - "v*"
-    pull_request:
-        branches:
-            - master
+   push:
+       branches:
+           - master
+       tags:
+           - "v*"
+   pull_request:
+       branches:
+          - master
 jobs:
    Build:
        strategy:
            matrix:
                options:
+                    - os: ubuntu-latest
+                      framework: net6.0
+                      sdk: 6.0.x
+                      sdk-preview: true
+                      runtime: -x64
+                      codecov: false
+                    - os: macos-latest
+                      framework: net6.0
+                      sdk: 6.0.x
+                      sdk-preview: true
+                      runtime: -x64
+                      codecov: false
+                    - os: windows-latest
+                      framework: net6.0
+                      sdk: 6.0.x
+                      sdk-preview: true
+                      runtime: -x64
+                      codecov: false
                    - os: ubuntu-latest
                      framework: net5.0
                      runtime: -x64
@ -52,37 +70,38 @@ jobs:
                      codecov: false

        runs-on: ${{matrix.options.os}}
-        if: "!contains(github.event.head_commit.message, '[skip ci]')"

        steps:
-            - uses: actions/checkout@v2
+            - name: Git Config
+              shell: bash
+              run: |
+                  git config --global core.autocrlf false
+                  git config --global core.longpaths true
+
+            - name: Git Checkout
+              uses: actions/checkout@v2
+              with:
+                fetch-depth: 0
+                submodules: recursive
            
            # See https://github.com/actions/checkout/issues/165#issuecomment-657673315
-            - name: Create LFS file list
+            - name: Git Create LFS FileList
              run: git lfs ls-files -l | cut -d' ' -f1 | sort > .lfs-assets-id

-            - name: Restore LFS cache
+            - name: Git Setup LFS Cache
              uses: actions/cache@v2
              id: lfs-cache
              with:
                path: .git/lfs
                key: ${{ runner.os }}-lfs-${{ hashFiles('.lfs-assets-id') }}-v1

-            - name: Git LFS Pull
+            - name: Git Pull LFS
              run: git lfs pull

-            - name: Install NuGet
+            - name: NuGet Install
              uses: NuGet/setup-nuget@v1

-            - name: Setup Git
-              shell: bash
-              run: |
-                  git config --global core.autocrlf false
-                  git config --global core.longpaths true
-                  git fetch --prune --unshallow
-                  git submodule -q update --init --recursive
-
-            - name: Setup NuGet Cache
+            - name: NuGet Setup Cache
              uses: actions/cache@v2
              id: nuget-cache
              with:
@ -90,60 +109,94 @@ jobs:
                  key: ${{ runner.os }}-nuget-${{ hashFiles('**/*.csproj', '**/*.props', '**/*.targets') }}
                  restore-keys: ${{ runner.os }}-nuget-

-            - name: Build
+            - name: DotNet Setup Preview
+              if: ${{ matrix.options.sdk-preview == true }}
+              uses: actions/setup-dotnet@v1
+              with:
+                dotnet-version: ${{ matrix.options.sdk }}
+                include-prerelease: true
+
+            - name: DotNet Build
+              if: ${{ matrix.options.sdk-preview != true }}
              shell: pwsh
              run: ./ci-build.ps1 "${{matrix.options.framework}}"
              env:
                SIXLABORS_TESTING: True

-            - name: Test
+            - name: DotNet Build Preview
+              if: ${{ matrix.options.sdk-preview == true }}
+              shell: pwsh
+              run: ./ci-build.ps1 "${{matrix.options.framework}}"
+              env:
+                SIXLABORS_TESTING_PREVIEW: True
+
+            - name: DotNet Test
+              if: ${{ matrix.options.sdk-preview != true }}
              shell: pwsh
              run: ./ci-test.ps1 "${{matrix.options.os}}" "${{matrix.options.framework}}" "${{matrix.options.runtime}}" "${{matrix.options.codecov}}"
              env:
-                  SIXLABORS_TESTING: True
-                  XUNIT_PATH: .\tests\ImageSharp.Tests # Required for xunit
+                SIXLABORS_TESTING: True
+                XUNIT_PATH: .\tests\ImageSharp.Tests # Required for xunit
+
+            - name: DotNet Test Preview
+              if: ${{ matrix.options.sdk-preview == true }}
+              shell: pwsh
+              run: ./ci-test.ps1 "${{matrix.options.os}}" "${{matrix.options.framework}}" "${{matrix.options.runtime}}" "${{matrix.options.codecov}}"
+              env:
+                SIXLABORS_TESTING_PREVIEW: True
+                XUNIT_PATH: .\tests\ImageSharp.Tests # Required for xunit

            - name: Export Failed Output
              uses: actions/upload-artifact@v2
              if: failure()
              with:
-                  name: actual_output_${{ runner.os }}_${{ matrix.options.framework }}${{ matrix.options.runtime }}.zip
-                  path: tests/Images/ActualOutput/
+                name: actual_output_${{ runner.os }}_${{ matrix.options.framework }}${{ matrix.options.runtime }}.zip
+                path: tests/Images/ActualOutput/

-            - name: Update Codecov
+            - name: Codecov Update
              uses: codecov/codecov-action@v1
              if: matrix.options.codecov == true && startsWith(github.repository, 'SixLabors')
              with:
-                  flags: unittests
+                flags: unittests

    Publish:
        needs: [Build]

-        runs-on: windows-latest
+        runs-on: ubuntu-latest

        if: (github.event_name == 'push')

        steps:
-            - uses: actions/checkout@v2
-
-            - name: Install NuGet
-              uses: NuGet/setup-nuget@v1
-
-            - name: Setup Git
+            - name: Git Config
              shell: bash
              run: |
                  git config --global core.autocrlf false
                  git config --global core.longpaths true
-                  git fetch --prune --unshallow
-                  git submodule -q update --init --recursive

-            - name: Pack
+            - name: Git Checkout
+              uses: actions/checkout@v2
+              with:
+                fetch-depth: 0
+                submodules: recursive
+
+            - name: NuGet Install
+              uses: NuGet/setup-nuget@v1
+
+            - name: NuGet Setup Cache
+              uses: actions/cache@v2
+              id: nuget-cache
+              with:
+                  path: ~/.nuget
+                  key: ${{ runner.os }}-nuget-${{ hashFiles('**/*.csproj', '**/*.props', '**/*.targets') }}
+                  restore-keys: ${{ runner.os }}-nuget-
+
+            - name: DotNet Pack
              shell: pwsh
              run: ./ci-pack.ps1

-            - name: Publish to MyGet
+            - name: MyGet Publish
              shell: pwsh
              run: |
-                  nuget.exe push .\artifacts\*.nupkg ${{secrets.MYGET_TOKEN}} -Source https://www.myget.org/F/sixlabors/api/v2/package
-                  nuget.exe push .\artifacts\*.snupkg ${{secrets.MYGET_TOKEN}} -Source https://www.myget.org/F/sixlabors/api/v3/index.json
+                  dotnet nuget push .\artifacts\*.nupkg -k ${{secrets.MYGET_TOKEN}} -s https://www.myget.org/F/sixlabors/api/v2/package
+                  dotnet nuget push .\artifacts\*.snupkg -k ${{secrets.MYGET_TOKEN}} -s https://www.myget.org/F/sixlabors/api/v3/index.json
              # TODO: If github.ref starts with 'refs/tags' then it was tag push and we can optionally push out package to nuget.org
--- a/.gitignore
+++ b/.gitignore
@ -223,3 +223,7 @@ artifacts/
 **/Images/ReferenceOutput
 **/Images/Input/MemoryStress
 .DS_Store
+
+#lfs
+hooks/**
+lfs/**
--- a/Directory.Build.props
+++ b/Directory.Build.props
@ -18,6 +18,11 @@
  <!-- Import the shared global .props file -->
  <Import Project="$(MSBuildThisFileDirectory)shared-infrastructure\msbuild\props\SixLabors.Global.props" />

+  <PropertyGroup Condition="$(SIXLABORS_TESTING_PREVIEW) == true">
+    <!-- Work around various issues bound to implicit language features. -->
+    <LangVersion>preview</LangVersion>
+  </PropertyGroup>
+
  <!--
  Ensure all custom build configurations based upon "Release" are optimized.
  This is easier than setting each project individually.
--- a/ImageSharp.sln
+++ b/ImageSharp.sln
@ -403,6 +403,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "issues", "issues", "{670DD4
 		tests\Images\Input\Png\issues\Issue_1127.png = tests\Images\Input\Png\issues\Issue_1127.png
 		tests\Images\Input\Png\issues\Issue_1177_1.png = tests\Images\Input\Png\issues\Issue_1177_1.png
 		tests\Images\Input\Png\issues\Issue_1177_2.png = tests\Images\Input\Png\issues\Issue_1177_2.png
+		tests\Images\Input\Png\issues\Issue_1765_Net6DeflateStreamRead.png = tests\Images\Input\Png\issues\Issue_1765_Net6DeflateStreamRead.png
 		tests\Images\Input\Png\issues\Issue_410.png = tests\Images\Input\Png\issues\Issue_410.png
 		tests\Images\Input\Png\issues\Issue_935.png = tests\Images\Input\Png\issues\Issue_935.png
 	EndProjectSection
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 9b94ebc4be9b7a8d7620c257e6ee485455973332
+Subproject commit a042aba176cdb840d800c6ed4cfe41a54fb7b1e3
--- a/src/ImageSharp/Advanced/ParallelExecutionSettings.cs
+++ b/src/ImageSharp/Advanced/ParallelExecutionSettings.cs
@ -3,7 +3,6 @@

 using System;
 using System.Threading.Tasks;
-
 using SixLabors.ImageSharp.Memory;

 namespace SixLabors.ImageSharp.Advanced
--- a/src/ImageSharp/Color/Color.Conversions.cs
+++ b/src/ImageSharp/Color/Color.Conversions.cs
@ -3,7 +3,6 @@

 using System.Numerics;
 using System.Runtime.CompilerServices;
-
 using SixLabors.ImageSharp.PixelFormats;

 namespace SixLabors.ImageSharp
@ -95,4 +94,4 @@ namespace SixLabors.ImageSharp
        [MethodImpl(InliningOptions.ShortMethod)]
        internal Vector4 ToVector4() => this.data.ToVector4();
    }
-}
+}
--- a/src/ImageSharp/Color/Color.WebSafePalette.cs
+++ b/src/ImageSharp/Color/Color.WebSafePalette.cs
@ -163,4 +163,4 @@ namespace SixLabors.ImageSharp
            YellowGreen
        };
    }
-}
+}
--- a/src/ImageSharp/Color/Color.cs
+++ b/src/ImageSharp/Color/Color.cs
@ -5,7 +5,6 @@ using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-
 using SixLabors.ImageSharp.PixelFormats;

 namespace SixLabors.ImageSharp
--- a/src/ImageSharp/ColorSpaces/CieLab.cs
+++ b/src/ImageSharp/ColorSpaces/CieLab.cs
@ -136,4 +136,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
                && this.WhitePoint.Equals(other.WhitePoint);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/CieLch.cs
+++ b/src/ImageSharp/ColorSpaces/CieLch.cs
@ -162,4 +162,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
            return result;
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/CieLchuv.cs
+++ b/src/ImageSharp/ColorSpaces/CieLchuv.cs
@ -157,4 +157,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
            return result;
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/CieLuv.cs
+++ b/src/ImageSharp/ColorSpaces/CieLuv.cs
@ -137,4 +137,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
                && this.WhitePoint.Equals(other.WhitePoint);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/CieXyy.cs
+++ b/src/ImageSharp/ColorSpaces/CieXyy.cs
@ -100,4 +100,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
                && this.Yl.Equals(other.Yl);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/CieXyz.cs
+++ b/src/ImageSharp/ColorSpaces/CieXyz.cs
@ -103,4 +103,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
                && this.Z.Equals(other.Z);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Cmyk.cs
+++ b/src/ImageSharp/ColorSpaces/Cmyk.cs
@ -108,4 +108,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
                && this.K.Equals(other.K);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Companding/GammaCompanding.cs
+++ b/src/ImageSharp/ColorSpaces/Companding/GammaCompanding.cs
@ -33,4 +33,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Companding
        [MethodImpl(InliningOptions.ShortMethod)]
        public static float Compress(float channel, float gamma) => MathF.Pow(channel, 1 / gamma);
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Companding/Rec2020Companding.cs
+++ b/src/ImageSharp/ColorSpaces/Companding/Rec2020Companding.cs
@ -38,4 +38,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Companding
        public static float Compress(float channel)
            => channel < Beta ? 4.5F * channel : (Alpha * MathF.Pow(channel, 0.45F)) - AlphaMinusOne;
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Companding/Rec709Companding.cs
+++ b/src/ImageSharp/ColorSpaces/Companding/Rec709Companding.cs
@ -34,4 +34,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Companding
        public static float Compress(float channel)
            => channel < 0.018F ? 4.5F * channel : (1.099F * MathF.Pow(channel, 0.45F)) - 0.099F;
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/CieConstants.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/CieConstants.cs
@ -19,4 +19,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
        /// </summary>
        public const float Kappa = 903.2963F;
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/ColorSpaceConverter.HunterLab.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/ColorSpaceConverter.HunterLab.cs
@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

 using System;
@ -429,4 +429,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return this.ToHunterLab(xyzColor);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/ColorSpaceConverter.Lms.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/ColorSpaceConverter.Lms.cs
@ -424,4 +424,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return this.ToLms(xyzColor);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CIeLchToCieLabConverter.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CIeLchToCieLabConverter.cs
@ -30,4 +30,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return new CieLab(l, a, b, input.WhitePoint);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzAndCieXyyConverter.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzAndCieXyyConverter.cs
@ -51,4 +51,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return new CieXyz(x, y, z);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzAndHunterLabConverterBase.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzAndHunterLabConverterBase.cs
@ -42,4 +42,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return 100F * (70F / 218.11F) * (whitePoint.Y + whitePoint.Z);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzAndLmsConverter.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzAndLmsConverter.cs
@ -67,4 +67,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return new CieXyz(vector);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzToCieLabConverter.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzToCieLabConverter.cs
@ -54,4 +54,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return new CieLab(l, a, b, this.LabWhitePoint);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzToCieLuvConverter.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzToCieLuvConverter.cs
@ -85,4 +85,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
        private static float ComputeVp(in CieXyz input)
           => (9 * input.Y) / (input.X + (15 * input.Y) + (3 * input.Z));
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzToHunterLabConverter.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzToHunterLabConverter.cs
@ -64,4 +64,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return new HunterLab(l, a, b, this.HunterLabWhitePoint);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzToLinearRgbConverter.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CieXyzToLinearRgbConverter.cs
@ -53,4 +53,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return new LinearRgb(vector, this.TargetWorkingSpace);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CmykAndRgbConverter.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/CmykAndRgbConverter.cs
@ -48,4 +48,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return new Cmyk(cmy.X, cmy.Y, cmy.Z, k.X);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/YCbCrAndRgbConverter.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/Converters/YCbCrAndRgbConverter.cs
@ -54,4 +54,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            return new YCbCr(y, cb, cr);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Conversion/Implementation/IChromaticAdaptation.cs
+++ b/src/ImageSharp/ColorSpaces/Conversion/Implementation/IChromaticAdaptation.cs
@ -36,4 +36,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Conversion
            CieXyz sourceWhitePoint,
            in CieXyz destinationWhitePoint);
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Hsl.cs
+++ b/src/ImageSharp/ColorSpaces/Hsl.cs
@ -101,4 +101,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
                && this.L.Equals(other.L);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Hsv.cs
+++ b/src/ImageSharp/ColorSpaces/Hsv.cs
@ -99,4 +99,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
                && this.V.Equals(other.V);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/HunterLab.cs
+++ b/src/ImageSharp/ColorSpaces/HunterLab.cs
@ -135,4 +135,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
                && this.WhitePoint.Equals(other.WhitePoint);
        }
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Illuminants.cs
+++ b/src/ImageSharp/ColorSpaces/Illuminants.cs
@ -69,4 +69,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
        /// </summary>
        public static readonly CieXyz F11 = new CieXyz(1.00962F, 1F, 0.64350F);
    }
-}
+}
--- a/src/ImageSharp/ColorSpaces/Lms.cs
+++ b/src/ImageSharp/ColorSpaces/Lms.cs
@ -104,4 +104,4 @@ namespace SixLabors.ImageSharp.ColorSpaces
                && this.S.Equals(other.S);
        }
    }
-}
+}
--- a/src/ImageSharp/Common/Constants.cs
+++ b/src/ImageSharp/Common/Constants.cs
@ -18,4 +18,4 @@ namespace SixLabors.ImageSharp
        /// </summary>
        public static readonly float EpsilonSquared = Epsilon * Epsilon;
    }
-}
+}
--- a/src/ImageSharp/Common/Helpers/InliningOptions.cs
+++ b/src/ImageSharp/Common/Helpers/InliningOptions.cs
@ -12,6 +12,10 @@ namespace SixLabors.ImageSharp
    /// </summary>
    internal static class InliningOptions
    {
+        /// <summary>
+        /// <see cref="MethodImplOptions.AggressiveInlining"/> regardless of the build conditions.
+        /// </summary>
+        public const MethodImplOptions AlwaysInline = MethodImplOptions.AggressiveInlining;
 #if PROFILING
        public const MethodImplOptions HotPath = MethodImplOptions.NoInlining;
        public const MethodImplOptions ShortMethod = MethodImplOptions.NoInlining;
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@ -537,7 +537,7 @@ namespace SixLabors.ImageSharp
            /// <param name="vm0">The first vector to multiply.</param>
            /// <param name="vm1">The second vector to multiply.</param>
            /// <returns>The <see cref="Vector256{T}"/>.</returns>
-            [MethodImpl(InliningOptions.ShortMethod)]
+            [MethodImpl(InliningOptions.AlwaysInline)]
            public static Vector256<float> MultiplyAdd(
                in Vector256<float> va,
                in Vector256<float> vm0,
@ -622,90 +622,89 @@ namespace SixLabors.ImageSharp
                ReadOnlySpan<byte> source,
                Span<float> dest)
            {
-                if (Avx2.IsSupported)
+                fixed (byte* sourceBase = source)
                {
-                    VerifySpanInput(source, dest, Vector256<byte>.Count);
-
-                    int n = dest.Length / Vector256<byte>.Count;
+                    if (Avx2.IsSupported)
+                    {
+                        VerifySpanInput(source, dest, Vector256<byte>.Count);

-                    byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
+                        int n = dest.Length / Vector256<byte>.Count;

-                    ref Vector256<float> destBase =
-                        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));
+                        ref Vector256<float> destBase =
+                            ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));

-                    var scale = Vector256.Create(1 / (float)byte.MaxValue);
+                        var scale = Vector256.Create(1 / (float)byte.MaxValue);

-                    for (int i = 0; i < n; i++)
-                    {
-                        int si = Vector256<byte>.Count * i;
-                        Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
-                        Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
-                        Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
-                        Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));
-
-                        Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
-                        Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
-                        Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
-                        Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));
-
-                        ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);
-
-                        d = f0;
-                        Unsafe.Add(ref d, 1) = f1;
-                        Unsafe.Add(ref d, 2) = f2;
-                        Unsafe.Add(ref d, 3) = f3;
+                        for (int i = 0; i < n; i++)
+                        {
+                            int si = Vector256<byte>.Count * i;
+                            Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
+                            Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
+                            Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
+                            Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));
+
+                            Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
+                            Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
+                            Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
+                            Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));
+
+                            ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);
+
+                            d = f0;
+                            Unsafe.Add(ref d, 1) = f1;
+                            Unsafe.Add(ref d, 2) = f2;
+                            Unsafe.Add(ref d, 3) = f3;
+                        }
                    }
-                }
-                else
-                {
-                    // Sse
-                    VerifySpanInput(source, dest, Vector128<byte>.Count);
-
-                    int n = dest.Length / Vector128<byte>.Count;
-
-                    byte* sourceBase = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
+                    else
+                    {
+                        // Sse
+                        VerifySpanInput(source, dest, Vector128<byte>.Count);

-                    ref Vector128<float> destBase =
-                        ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));
+                        int n = dest.Length / Vector128<byte>.Count;

-                    var scale = Vector128.Create(1 / (float)byte.MaxValue);
-                    Vector128<byte> zero = Vector128<byte>.Zero;
+                        ref Vector128<float> destBase =
+                            ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));

-                    for (int i = 0; i < n; i++)
-                    {
-                        int si = Vector128<byte>.Count * i;
+                        var scale = Vector128.Create(1 / (float)byte.MaxValue);
+                        Vector128<byte> zero = Vector128<byte>.Zero;

-                        Vector128<int> i0, i1, i2, i3;
-                        if (Sse41.IsSupported)
-                        {
-                            i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
-                            i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
-                            i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
-                            i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
-                        }
-                        else
+                        for (int i = 0; i < n; i++)
                        {
-                            Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
-                            Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
-                            Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();
-
-                            i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
-                            i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
-                            i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
-                            i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
+                            int si = Vector128<byte>.Count * i;
+
+                            Vector128<int> i0, i1, i2, i3;
+                            if (Sse41.IsSupported)
+                            {
+                                i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
+                                i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
+                                i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
+                                i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
+                            }
+                            else
+                            {
+                                Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
+                                Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
+                                Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();
+
+                                i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
+                                i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
+                                i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
+                                i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
+                            }
+
+                            Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
+                            Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
+                            Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
+                            Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));
+
+                            ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);
+
+                            d = f0;
+                            Unsafe.Add(ref d, 1) = f1;
+                            Unsafe.Add(ref d, 2) = f2;
+                            Unsafe.Add(ref d, 3) = f3;
                        }
-
-                        Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
-                        Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
-                        Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
-                        Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));
-
-                        ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);
-
-                        d = f0;
-                        Unsafe.Add(ref d, 1) = f1;
-                        Unsafe.Add(ref d, 2) = f2;
-                        Unsafe.Add(ref d, 3) = f3;
                    }
                }
            }
--- a/src/ImageSharp/Common/Helpers/SimdUtils.Pack.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.Pack.cs
@ -5,9 +5,7 @@ using System;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.PixelFormats;
-
 #if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 #endif

@ -203,4 +201,4 @@ namespace SixLabors.ImageSharp
            }
        }
    }
-}
+}
--- a/src/ImageSharp/Compression/Zlib/Adler32.cs
+++ b/src/ImageSharp/Compression/Zlib/Adler32.cs
@ -91,115 +91,117 @@ namespace SixLabors.ImageSharp.Compression.Zlib

            int index = 0;
            fixed (byte* bufferPtr = buffer)
-            fixed (byte* tapPtr = Tap1Tap2)
            {
-                index += (int)blocks * BLOCK_SIZE;
-                var localBufferPtr = bufferPtr;
-
-                // _mm_setr_epi8 on x86
-                Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
-                Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
-                Vector128<byte> zero = Vector128<byte>.Zero;
-                var ones = Vector128.Create((short)1);
-
-                while (blocks > 0)
+                fixed (byte* tapPtr = Tap1Tap2)
                {
-                    uint n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
-                    if (n > blocks)
-                    {
-                        n = blocks;
-                    }
+                    index += (int)blocks * BLOCK_SIZE;
+                    var localBufferPtr = bufferPtr;

-                    blocks -= n;
+                    // _mm_setr_epi8 on x86
+                    Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
+                    Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
+                    Vector128<byte> zero = Vector128<byte>.Zero;
+                    var ones = Vector128.Create((short)1);

-                    // Process n blocks of data. At most NMAX data bytes can be
-                    // processed before s2 must be reduced modulo BASE.
-                    Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
-                    Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
-                    Vector128<uint> v_s1 = Vector128<uint>.Zero;
-
-                    do
+                    while (blocks > 0)
                    {
-                        // Load 32 input bytes.
-                        Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
-                        Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);
+                        uint n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
+                        if (n > blocks)
+                        {
+                            n = blocks;
+                        }

-                        // Add previous block byte sum to v_ps.
-                        v_ps = Sse2.Add(v_ps, v_s1);
+                        blocks -= n;

-                        // Horizontally add the bytes for s1, multiply-adds the
-                        // bytes by [ 32, 31, 30, ... ] for s2.
-                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
-                        Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
-                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());
+                        // Process n blocks of data. At most NMAX data bytes can be
+                        // processed before s2 must be reduced modulo BASE.
+                        Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
+                        Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
+                        Vector128<uint> v_s1 = Vector128<uint>.Zero;

-                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
-                        Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
-                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());
+                        do
+                        {
+                            // Load 32 input bytes.
+                            Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
+                            Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

-                        localBufferPtr += BLOCK_SIZE;
-                    }
-                    while (--n > 0);
+                            // Add previous block byte sum to v_ps.
+                            v_ps = Sse2.Add(v_ps, v_s1);

-                    v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));
+                            // Horizontally add the bytes for s1, multiply-adds the
+                            // bytes by [ 32, 31, 30, ... ] for s2.
+                            v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
+                            Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
+                            v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());

-                    // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
-                    const byte S2301 = 0b1011_0001;  // A B C D -> B A D C
-                    const byte S1032 = 0b0100_1110;  // A B C D -> C D A B
+                            v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
+                            Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
+                            v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

-                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
+                            localBufferPtr += BLOCK_SIZE;
+                        }
+                        while (--n > 0);

-                    s1 += v_s1.ToScalar();
+                        v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

-                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
-                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
+                        // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
+                        const byte S2301 = 0b1011_0001;  // A B C D -> B A D C
+                        const byte S1032 = 0b0100_1110;  // A B C D -> C D A B

-                    s2 = v_s2.ToScalar();
+                        v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

-                    // Reduce.
-                    s1 %= BASE;
-                    s2 %= BASE;
-                }
+                        s1 += v_s1.ToScalar();

-                if (length > 0)
-                {
-                    if (length >= 16)
-                    {
-                        s2 += s1 += localBufferPtr[0];
-                        s2 += s1 += localBufferPtr[1];
-                        s2 += s1 += localBufferPtr[2];
-                        s2 += s1 += localBufferPtr[3];
-                        s2 += s1 += localBufferPtr[4];
-                        s2 += s1 += localBufferPtr[5];
-                        s2 += s1 += localBufferPtr[6];
-                        s2 += s1 += localBufferPtr[7];
-                        s2 += s1 += localBufferPtr[8];
-                        s2 += s1 += localBufferPtr[9];
-                        s2 += s1 += localBufferPtr[10];
-                        s2 += s1 += localBufferPtr[11];
-                        s2 += s1 += localBufferPtr[12];
-                        s2 += s1 += localBufferPtr[13];
-                        s2 += s1 += localBufferPtr[14];
-                        s2 += s1 += localBufferPtr[15];
+                        v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
+                        v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

-                        localBufferPtr += 16;
-                        length -= 16;
-                    }
+                        s2 = v_s2.ToScalar();

-                    while (length-- > 0)
-                    {
-                        s2 += s1 += *localBufferPtr++;
+                        // Reduce.
+                        s1 %= BASE;
+                        s2 %= BASE;
                    }

-                    if (s1 >= BASE)
+                    if (length > 0)
                    {
-                        s1 -= BASE;
+                        if (length >= 16)
+                        {
+                            s2 += s1 += localBufferPtr[0];
+                            s2 += s1 += localBufferPtr[1];
+                            s2 += s1 += localBufferPtr[2];
+                            s2 += s1 += localBufferPtr[3];
+                            s2 += s1 += localBufferPtr[4];
+                            s2 += s1 += localBufferPtr[5];
+                            s2 += s1 += localBufferPtr[6];
+                            s2 += s1 += localBufferPtr[7];
+                            s2 += s1 += localBufferPtr[8];
+                            s2 += s1 += localBufferPtr[9];
+                            s2 += s1 += localBufferPtr[10];
+                            s2 += s1 += localBufferPtr[11];
+                            s2 += s1 += localBufferPtr[12];
+                            s2 += s1 += localBufferPtr[13];
+                            s2 += s1 += localBufferPtr[14];
+                            s2 += s1 += localBufferPtr[15];
+
+                            localBufferPtr += 16;
+                            length -= 16;
+                        }
+
+                        while (length-- > 0)
+                        {
+                            s2 += s1 += *localBufferPtr++;
+                        }
+
+                        if (s1 >= BASE)
+                        {
+                            s1 -= BASE;
+                        }
+
+                        s2 %= BASE;
                    }

-                    s2 %= BASE;
+                    return s1 | (s2 << 16);
                }
-
-                return s1 | (s2 << 16);
            }
        }
 #endif
--- a/src/ImageSharp/Compression/Zlib/Crc32.cs
+++ b/src/ImageSharp/Compression/Zlib/Crc32.cs
@ -83,117 +83,119 @@ namespace SixLabors.ImageSharp.Compression.Zlib
            int length = chunksize;

            fixed (byte* bufferPtr = buffer)
-            fixed (ulong* k05PolyPtr = K05Poly)
            {
-                byte* localBufferPtr = bufferPtr;
-                ulong* localK05PolyPtr = k05PolyPtr;
-
-                // There's at least one block of 64.
-                Vector128<ulong> x1 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x00));
-                Vector128<ulong> x2 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x10));
-                Vector128<ulong> x3 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x20));
-                Vector128<ulong> x4 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x30));
-                Vector128<ulong> x5;
-
-                x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());
-
-                // k1, k2
-                Vector128<ulong> x0 = Sse2.LoadVector128(localK05PolyPtr + 0x0);
-
-                localBufferPtr += 64;
-                length -= 64;
-
-                // Parallel fold blocks of 64, if any.
-                while (length >= 64)
+                fixed (ulong* k05PolyPtr = K05Poly)
                {
-                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                    Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
-                    Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
-                    Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);
-
-                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
-                    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
-                    x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
-                    x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);
+                    byte* localBufferPtr = bufferPtr;
+                    ulong* localK05PolyPtr = k05PolyPtr;

-                    Vector128<ulong> y5 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x00));
-                    Vector128<ulong> y6 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x10));
-                    Vector128<ulong> y7 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x20));
-                    Vector128<ulong> y8 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x30));
+                    // There's at least one block of 64.
+                    Vector128<ulong> x1 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x00));
+                    Vector128<ulong> x2 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x10));
+                    Vector128<ulong> x3 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x20));
+                    Vector128<ulong> x4 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x30));
+                    Vector128<ulong> x5;

-                    x1 = Sse2.Xor(x1, x5);
-                    x2 = Sse2.Xor(x2, x6);
-                    x3 = Sse2.Xor(x3, x7);
-                    x4 = Sse2.Xor(x4, x8);
+                    x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());

-                    x1 = Sse2.Xor(x1, y5);
-                    x2 = Sse2.Xor(x2, y6);
-                    x3 = Sse2.Xor(x3, y7);
-                    x4 = Sse2.Xor(x4, y8);
+                    // k1, k2
+                    Vector128<ulong> x0 = Sse2.LoadVector128(localK05PolyPtr + 0x0);

                    localBufferPtr += 64;
                    length -= 64;
-                }
-
-                // Fold into 128-bits.
-                // k3, k4
-                x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);

-                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
-                x1 = Sse2.Xor(x1, x2);
-                x1 = Sse2.Xor(x1, x5);
+                    // Parallel fold blocks of 64, if any.
+                    while (length >= 64)
+                    {
+                        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                        Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
+                        Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
+                        Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);
+
+                        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
+                        x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
+                        x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
+                        x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);
+
+                        Vector128<ulong> y5 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x00));
+                        Vector128<ulong> y6 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x10));
+                        Vector128<ulong> y7 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x20));
+                        Vector128<ulong> y8 = Sse2.LoadVector128((ulong*)(localBufferPtr + 0x30));
+
+                        x1 = Sse2.Xor(x1, x5);
+                        x2 = Sse2.Xor(x2, x6);
+                        x3 = Sse2.Xor(x3, x7);
+                        x4 = Sse2.Xor(x4, x8);
+
+                        x1 = Sse2.Xor(x1, y5);
+                        x2 = Sse2.Xor(x2, y6);
+                        x3 = Sse2.Xor(x3, y7);
+                        x4 = Sse2.Xor(x4, y8);
+
+                        localBufferPtr += 64;
+                        length -= 64;
+                    }
+
+                    // Fold into 128-bits.
+                    // k3, k4
+                    x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);

-                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
-                x1 = Sse2.Xor(x1, x3);
-                x1 = Sse2.Xor(x1, x5);
-
-                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
-                x1 = Sse2.Xor(x1, x4);
-                x1 = Sse2.Xor(x1, x5);
+                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
+                    x1 = Sse2.Xor(x1, x2);
+                    x1 = Sse2.Xor(x1, x5);

-                // Single fold blocks of 16, if any.
-                while (length >= 16)
-                {
-                    x2 = Sse2.LoadVector128((ulong*)localBufferPtr);
+                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
+                    x1 = Sse2.Xor(x1, x3);
+                    x1 = Sse2.Xor(x1, x5);

                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
-                    x1 = Sse2.Xor(x1, x2);
+                    x1 = Sse2.Xor(x1, x4);
                    x1 = Sse2.Xor(x1, x5);

-                    localBufferPtr += 16;
-                    length -= 16;
-                }
+                    // Single fold blocks of 16, if any.
+                    while (length >= 16)
+                    {
+                        x2 = Sse2.LoadVector128((ulong*)localBufferPtr);
+
+                        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
+                        x1 = Sse2.Xor(x1, x2);
+                        x1 = Sse2.Xor(x1, x5);

-                // Fold 128 - bits to 64 - bits.
-                x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
-                x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
-                x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
-                x1 = Sse2.Xor(x1, x2);
+                        localBufferPtr += 16;
+                        length -= 16;
+                    }

-                // k5, k0
-                x0 = Sse2.LoadScalarVector128(localK05PolyPtr + 0x4);
+                    // Fold 128-bits to 64-bits.
+                    x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
+                    x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
+                    x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
+                    x1 = Sse2.Xor(x1, x2);

-                x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
-                x1 = Sse2.And(x1, x3);
-                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
-                x1 = Sse2.Xor(x1, x2);
+                    // k5, k0
+                    x0 = Sse2.LoadScalarVector128(localK05PolyPtr + 0x4);

-                // Barret reduce to 32-bits.
-                // polynomial
-                x0 = Sse2.LoadVector128(localK05PolyPtr + 0x6);
+                    x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
+                    x1 = Sse2.And(x1, x3);
+                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
+                    x1 = Sse2.Xor(x1, x2);

-                x2 = Sse2.And(x1, x3);
-                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
-                x2 = Sse2.And(x2, x3);
-                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
-                x1 = Sse2.Xor(x1, x2);
+                    // Barrett reduce to 32-bits.
+                    // polynomial
+                    x0 = Sse2.LoadVector128(localK05PolyPtr + 0x6);

-                crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
-                return buffer.Length - chunksize == 0 ? crc : CalculateScalar(crc, buffer.Slice(chunksize));
+                    x2 = Sse2.And(x1, x3);
+                    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
+                    x2 = Sse2.And(x2, x3);
+                    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
+                    x1 = Sse2.Xor(x1, x2);
+
+                    crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
+                    return buffer.Length - chunksize == 0 ? crc : CalculateScalar(crc, buffer.Slice(chunksize));
+                }
            }
        }
 #endif
--- a/src/ImageSharp/Formats/Bmp/BmpConfigurationModule.cs
+++ b/src/ImageSharp/Formats/Bmp/BmpConfigurationModule.cs
@ -16,4 +16,4 @@ namespace SixLabors.ImageSharp.Formats.Bmp
            configuration.ImageFormatsManager.AddImageFormatDetector(new BmpImageFormatDetector());
        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Bmp/BmpConstants.cs
+++ b/src/ImageSharp/Formats/Bmp/BmpConstants.cs
@ -56,4 +56,4 @@ namespace SixLabors.ImageSharp.Formats.Bmp
            public const int Pointer = 0x5450;
        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Bmp/BmpFormat.cs
+++ b/src/ImageSharp/Formats/Bmp/BmpFormat.cs
@ -34,4 +34,4 @@ namespace SixLabors.ImageSharp.Formats.Bmp
        /// <inheritdoc/>
        public BmpMetadata CreateDefaultFormatMetadata() => new BmpMetadata();
    }
-}
+}
--- a/src/ImageSharp/Formats/Bmp/BmpMetadata.cs
+++ b/src/ImageSharp/Formats/Bmp/BmpMetadata.cs
@ -40,4 +40,4 @@ namespace SixLabors.ImageSharp.Formats.Bmp

        // TODO: Colors used once we support encoding palette bmps.
    }
-}
+}
--- a/src/ImageSharp/Formats/Bmp/IBmpDecoderOptions.cs
+++ b/src/ImageSharp/Formats/Bmp/IBmpDecoderOptions.cs
@ -13,4 +13,4 @@ namespace SixLabors.ImageSharp.Formats.Bmp
        /// </summary>
        RleSkippedPixelHandling RleSkippedPixelHandling { get; }
    }
-}
+}
--- a/src/ImageSharp/Formats/Gif/GifConfigurationModule.cs
+++ b/src/ImageSharp/Formats/Gif/GifConfigurationModule.cs
@ -16,4 +16,4 @@ namespace SixLabors.ImageSharp.Formats.Gif
            configuration.ImageFormatsManager.AddImageFormatDetector(new GifImageFormatDetector());
        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Gif/GifDecoderCore.cs
+++ b/src/ImageSharp/Formats/Gif/GifDecoderCore.cs
@ -8,7 +8,6 @@ using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Text;
 using System.Threading;
-
 using SixLabors.ImageSharp.IO;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.Metadata;
--- a/src/ImageSharp/Formats/Gif/GifDisposalMethod.cs
+++ b/src/ImageSharp/Formats/Gif/GifDisposalMethod.cs
@ -35,4 +35,4 @@ namespace SixLabors.ImageSharp.Formats.Gif
        /// </summary>
        RestoreToPrevious = 3
    }
-}
+}
--- a/src/ImageSharp/Formats/Gif/GifFormat.cs
+++ b/src/ImageSharp/Formats/Gif/GifFormat.cs
@ -37,4 +37,4 @@ namespace SixLabors.ImageSharp.Formats.Gif
        /// <inheritdoc/>
        public GifFrameMetadata CreateDefaultFormatFrameMetadata() => new GifFrameMetadata();
    }
-}
+}
--- a/src/ImageSharp/Formats/Gif/GifImageFormatDetector.cs
+++ b/src/ImageSharp/Formats/Gif/GifImageFormatDetector.cs
@ -30,4 +30,4 @@ namespace SixLabors.ImageSharp.Formats.Gif
                   header[5] == 0x61;   // a
        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Gif/LzwEncoder.cs
+++ b/src/ImageSharp/Formats/Gif/LzwEncoder.cs
@ -6,7 +6,6 @@ using System.Buffers;
 using System.IO;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-
 using SixLabors.ImageSharp.Memory;

 namespace SixLabors.ImageSharp.Formats.Gif
--- a/src/ImageSharp/Formats/Gif/Sections/GifGraphicControlExtension.cs
+++ b/src/ImageSharp/Formats/Gif/Sections/GifGraphicControlExtension.cs
@ -103,4 +103,4 @@ namespace SixLabors.ImageSharp.Formats.Gif
            return value;
        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Gif/Sections/GifImageDescriptor.cs
+++ b/src/ImageSharp/Formats/Gif/Sections/GifImageDescriptor.cs
@ -113,4 +113,4 @@ namespace SixLabors.ImageSharp.Formats.Gif
            return value;
        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Gif/Sections/GifLogicalScreenDescriptor.cs
+++ b/src/ImageSharp/Formats/Gif/Sections/GifLogicalScreenDescriptor.cs
@ -130,4 +130,4 @@ namespace SixLabors.ImageSharp.Formats.Gif
            return value;
        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Gif/Sections/IGifExtension.cs
+++ b/src/ImageSharp/Formats/Gif/Sections/IGifExtension.cs
@ -22,4 +22,4 @@ namespace SixLabors.ImageSharp.Formats.Gif
        /// <returns>The number of bytes written to the buffer.</returns>
        int WriteTo(Span<byte> buffer);
    }
-}
+}
--- a/src/ImageSharp/Formats/IImageFormat.cs
+++ b/src/ImageSharp/Formats/IImageFormat.cs
@ -60,4 +60,4 @@ namespace SixLabors.ImageSharp.Formats
        /// <returns>The <typeparamref name="TFormatFrameMetadata"/>.</returns>
        TFormatFrameMetadata CreateDefaultFormatFrameMetadata();
    }
-}
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@ -2,17 +2,22 @@
 // Licensed under the Apache License, Version 2.0.

 using System;
-using System.Diagnostics;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 using System.Text;

 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
    /// <summary>
-    /// Represents a Jpeg block with <see cref="short"/> coefficients.
+    /// 8x8 matrix of <see cref="short"/> coefficients.
    /// </summary>
    // ReSharper disable once InconsistentNaming
+    [StructLayout(LayoutKind.Explicit)]
    internal unsafe struct Block8x8 : IEquatable<Block8x8>
    {
        /// <summary>
@ -20,24 +25,44 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        /// </summary>
        public const int Size = 64;

+#pragma warning disable IDE0051 // Remove unused private member
        /// <summary>
-        /// A fixed size buffer holding the values.
-        /// See: <see>
-        ///         <cref>https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/unsafe-code-pointers/fixed-size-buffers</cref>
-        ///     </see>
+        /// A placeholder buffer so the actual struct occupies exactly 64 * 2 bytes.
        /// </summary>
+        /// <remarks>
+        /// This is not used directly in the code.
+        /// </remarks>
+        [FieldOffset(0)]
        private fixed short data[Size];
-
-        /// <summary>
-        /// Initializes a new instance of the <see cref="Block8x8"/> struct.
-        /// </summary>
-        /// <param name="coefficients">A <see cref="Span{T}"/> of coefficients</param>
-        public Block8x8(Span<short> coefficients)
-        {
-            ref byte selfRef = ref Unsafe.As<Block8x8, byte>(ref this);
-            ref byte sourceRef = ref Unsafe.As<short, byte>(ref MemoryMarshal.GetReference(coefficients));
-            Unsafe.CopyBlock(ref selfRef, ref sourceRef, Size * sizeof(short));
-        }
+#pragma warning restore IDE0051
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [FieldOffset(0)]
+        public Vector128<short> V0;
+        [FieldOffset(16)]
+        public Vector128<short> V1;
+        [FieldOffset(32)]
+        public Vector128<short> V2;
+        [FieldOffset(48)]
+        public Vector128<short> V3;
+        [FieldOffset(64)]
+        public Vector128<short> V4;
+        [FieldOffset(80)]
+        public Vector128<short> V5;
+        [FieldOffset(96)]
+        public Vector128<short> V6;
+        [FieldOffset(112)]
+        public Vector128<short> V7;
+
+        [FieldOffset(0)]
+        public Vector256<short> V01;
+        [FieldOffset(32)]
+        public Vector256<short> V23;
+        [FieldOffset(64)]
+        public Vector256<short> V45;
+        [FieldOffset(96)]
+        public Vector256<short> V67;
+#endif

        /// <summary>
        /// Gets or sets a <see cref="short"/> value at the given index
@ -49,7 +74,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            get
            {
-                GuardBlockIndex(idx);
+                DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
+
                ref short selfRef = ref Unsafe.As<Block8x8, short>(ref this);
                return Unsafe.Add(ref selfRef, idx);
            }
@ -57,7 +83,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            set
            {
-                GuardBlockIndex(idx);
+                DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
+
                ref short selfRef = ref Unsafe.As<Block8x8, short>(ref this);
                Unsafe.Add(ref selfRef, idx) = value;
            }
@ -75,15 +102,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            set => this[(y * 8) + x] = value;
        }

-        public static bool operator ==(Block8x8 left, Block8x8 right)
-        {
-            return left.Equals(right);
-        }
+        public static bool operator ==(Block8x8 left, Block8x8 right) => left.Equals(right);

-        public static bool operator !=(Block8x8 left, Block8x8 right)
-        {
-            return !left.Equals(right);
-        }
+        public static bool operator !=(Block8x8 left, Block8x8 right) => !left.Equals(right);

        /// <summary>
        /// Multiply all elements by a given <see cref="int"/>
@ -149,34 +170,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            return result;
        }

-        /// <summary>
-        /// Pointer-based "Indexer" (getter part)
-        /// </summary>
-        /// <param name="blockPtr">Block pointer</param>
-        /// <param name="idx">Index</param>
-        /// <returns>The scaleVec value at the specified index</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static short GetScalarAt(Block8x8* blockPtr, int idx)
-        {
-            GuardBlockIndex(idx);
-
-            short* fp = blockPtr->data;
-            return fp[idx];
-        }
-
-        /// <summary>
-        /// Pointer-based "Indexer" (setter part)
-        /// </summary>
-        /// <param name="blockPtr">Block pointer</param>
-        /// <param name="idx">Index</param>
-        /// <param name="value">Value</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void SetScalarAt(Block8x8* blockPtr, int idx, short value)
+        /// <summary>
+        /// Creates a new <see cref="Block8x8"/> filled with the first <see cref="Size"/> values of the given span.
+        /// </summary>
+        /// <param name="data">Source of the raw 16bit coefficients.</param>
+        /// <returns>The loaded <see cref="Block8x8"/>.</returns>
+        public static Block8x8 Load(Span<short> data)
        {
-            GuardBlockIndex(idx);
-
-            short* fp = blockPtr->data;
-            fp[idx] = value;
+            // Zero-initialization is skipped: LoadFrom overwrites all Size * sizeof(short) bytes of the block.
+            Unsafe.SkipInit(out Block8x8 result);
+            result.LoadFrom(data);
+            return result;
        }

        /// <summary>
@ -194,7 +192,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        /// </summary>
        public short[] ToArray()
        {
-            var result = new short[Size];
+            short[] result = new short[Size];
            this.CopyTo(result);
            return result;
        }
@ -206,7 +204,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        {
            ref byte selfRef = ref Unsafe.As<Block8x8, byte>(ref this);
            ref byte destRef = ref MemoryMarshal.GetReference(MemoryMarshal.Cast<short, byte>(destination));
-            Unsafe.CopyBlock(ref destRef, ref selfRef, Size * sizeof(short));
+            Unsafe.CopyBlockUnaligned(ref destRef, ref selfRef, Size * sizeof(short));
        }

        /// <summary>
@ -220,6 +218,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            }
        }

+        /// <summary>
+        /// Load raw 16bit integers from source.
+        /// </summary>
+        /// <param name="source">Source span; must contain at least <see cref="Size"/> elements.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public void LoadFrom(Span<short> source)
+        {
+            // The block copy below always reads Size * sizeof(short) bytes, so a shorter span
+            // would be read past its end. Guarded the same way the indexer guards its index.
+            DebugGuard.MustBeGreaterThanOrEqualTo(source.Length, Size, nameof(source));
+
+            ref byte sourceRef = ref Unsafe.As<short, byte>(ref MemoryMarshal.GetReference(source));
+            ref byte destRef = ref Unsafe.As<Block8x8, byte>(ref this);
+
+            Unsafe.CopyBlockUnaligned(ref destRef, ref sourceRef, Size * sizeof(short));
+        }
+
        /// <summary>
        /// Cast and copy <see cref="Size"/> <see cref="int"/>-s from the beginning of 'source' span.
        /// </summary>
@ -231,13 +242,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            }
        }

-        [Conditional("DEBUG")]
-        private static void GuardBlockIndex(int idx)
-        {
-            DebugGuard.MustBeLessThan(idx, Size, nameof(idx));
-            DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx));
-        }
-
        /// <inheritdoc />
        public override string ToString()
        {
@ -271,15 +275,66 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        }

        /// <inheritdoc />
-        public override bool Equals(object obj)
-        {
-            return obj is Block8x8 other && this.Equals(other);
-        }
+        public override bool Equals(object obj) => obj is Block8x8 other && this.Equals(other);

        /// <inheritdoc />
-        public override int GetHashCode()
+        public override int GetHashCode() => (this[0] * 31) + this[1];
+
+        /// <summary>
+        /// Returns index of the last non-zero element in given matrix.
+        /// </summary>
+        /// <returns>
+        /// Index of the last non-zero element. Returns -1 if all elements are equal to zero.
+        /// </returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public nint GetLastNonZeroIndex()
        {
-            return (this[0] * 31) + this[1];
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
+            {
+                const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
+
+                Vector256<short> zero16 = Vector256<short>.Zero;
+
+                ref Vector256<short> mcuStride = ref Unsafe.As<Block8x8, Vector256<short>>(ref this);
+
+                for (nint i = 3; i >= 0; i--)
+                {
+                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i), zero16).AsByte());
+
+                    if (areEqual != equalityMask)
+                    {
+                        // Each 2 bits represents comparison operation for each 2-byte element in input vectors
+                        // LSB represents first element in the stride
+                        // MSB represents last element in the stride
+                        // lzcnt operation would calculate number of zero numbers at the end
+
+                        // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements
+                        // So we need to invert it
+                        int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual);
+
+                        // As input number is represented by 2 bits in the mask, we need to divide lzcnt result by 2
+                        // to get the exact number of zero elements in the stride
+                        int strideRelativeIndex = 15 - (lzcnt / 2);
+                        return (i * 16) + strideRelativeIndex;
+                    }
+                }
+
+                return -1;
+            }
+            else
+#endif
+            {
+                nint index = Size - 1;
+                ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);
+
+                while (index >= 0 && Unsafe.Add(ref elemRef, index) == 0)
+                {
+                    index--;
+                }
+
+                return index;
+            }
        }

        /// <summary>
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@ -0,0 +1,149 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+    internal partial struct Block8x8F
+    {
+        /// <summary>
+        /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
+        /// </summary>
+        public const int RowCount = 8;
+
+        // 256-bit views declared at 32-byte steps (offsets 0..224): one Vector256<float> per 8 packed floats.
+        // NOTE(review): presumably each Vn overlays the VnL/VnR Vector4 pair declared in the main part of this
+        // explicit-layout struct - confirm against the offsets in Block8x8F.cs.
+        [FieldOffset(0)]
+        public Vector256<float> V0;
+        [FieldOffset(32)]
+        public Vector256<float> V1;
+        [FieldOffset(64)]
+        public Vector256<float> V2;
+        [FieldOffset(96)]
+        public Vector256<float> V3;
+        [FieldOffset(128)]
+        public Vector256<float> V4;
+        [FieldOffset(160)]
+        public Vector256<float> V5;
+        [FieldOffset(192)]
+        public Vector256<float> V6;
+        [FieldOffset(224)]
+        public Vector256<float> V7;
+
+        // Permutation applied by MultiplyIntoInt16_Avx2 to the result of Avx2.PackSignedSaturate.
+        // The 256-bit pack works on each 128-bit lane independently, so in 32-bit units its output is
+        // [a0..3, b0..3, a4..7, b4..7]; indices (0, 1, 4, 5, 2, 3, 6, 7) swap the two middle 64-bit
+        // groups and yield [a0..7, b0..7].
+        private static readonly Vector256<int> MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
+
+        /// <summary>
+        /// Multiplies <paramref name="a"/> by <paramref name="b"/> element-wise, converts the products to
+        /// 32bit integers and stores them into <paramref name="dest"/> as 16bit integers using signed saturation.
+        /// </summary>
+        /// <param name="a">First factor.</param>
+        /// <param name="b">Second factor.</param>
+        /// <param name="dest">Destination block receiving the packed results.</param>
+        private static void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+        {
+            DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
+
+            ref Vector256<float> aBase = ref a.V0;
+            ref Vector256<float> bBase = ref b.V0;
+
+            ref Vector256<short> destRef = ref dest.V01;
+
+            // Loop invariant: read the static permutation mask once.
+            Vector256<int> shuffleMask = MultiplyIntoInt16ShuffleMask;
+
+            // Two 8-float source vectors are consumed per iteration and produce one 16-short destination vector.
+            for (nint i = 0; i < 8; i += 2)
+            {
+                Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+                Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+
+                // The pack interleaves row0/row1 per 128-bit lane; the permute restores
+                // "all of row0, then all of row1" element order.
+                Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
+                row = Avx2.PermuteVar8x32(row.AsInt32(), shuffleMask).AsInt16();
+
+                Unsafe.Add(ref destRef, (IntPtr)((uint)i / 2)) = row;
+            }
+        }
+
+        /// <summary>
+        /// 128-bit counterpart of the Avx2 multiply: multiplies <paramref name="a"/> by <paramref name="b"/>
+        /// element-wise, converts the products to 32bit integers and stores them into
+        /// <paramref name="dest"/> as 16bit integers using signed saturation.
+        /// </summary>
+        /// <param name="a">First factor.</param>
+        /// <param name="b">Second factor.</param>
+        /// <param name="dest">Destination block receiving the packed results.</param>
+        private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+        {
+            DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");
+
+            ref Vector128<float> lhs = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
+            ref Vector128<float> rhs = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);
+            ref Vector128<short> output = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);
+
+            // 16 source vectors of 4 floats collapse pairwise into 8 destination vectors of 8 shorts.
+            for (int pair = 0; pair < 8; pair++)
+            {
+                int lo = pair * 2;
+                int hi = lo + 1;
+
+                Vector128<float> productLo = Sse.Multiply(Unsafe.Add(ref lhs, lo), Unsafe.Add(ref rhs, lo));
+                Vector128<float> productHi = Sse.Multiply(Unsafe.Add(ref lhs, hi), Unsafe.Add(ref rhs, hi));
+
+                Unsafe.Add(ref output, pair) = Sse2.PackSignedSaturate(
+                    Sse2.ConvertToVector128Int32(productLo),
+                    Sse2.ConvertToVector128Int32(productHi));
+            }
+        }
+
+        /// <summary>
+        /// AVX implementation of the inplace transpose.
+        /// All source data is first gathered into eight local vectors, then V0..V7 are overwritten
+        /// with the results of unpack/shuffle/blend sequences.
+        /// </summary>
+        private void TransposeInplace_Avx()
+        {
+            // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
+
+            // r0..r3: V0..V3 with their upper 128 bits (index 1) replaced by V4L..V7L.
+            Vector256<float> r0 = Avx.InsertVector128(
+                this.V0,
+                Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
+                1);
+
+            Vector256<float> r1 = Avx.InsertVector128(
+               this.V1,
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
+               1);
+
+            Vector256<float> r2 = Avx.InsertVector128(
+               this.V2,
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
+               1);
+
+            Vector256<float> r3 = Avx.InsertVector128(
+               this.V3,
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
+               1);
+
+            // r4..r7: lower 128 bits from V0R..V3R (widened via ToVector256), upper 128 bits from V4R..V7R.
+            Vector256<float> r4 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
+               1);
+
+            Vector256<float> r5 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
+               1);
+
+            Vector256<float> r6 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
+               1);
+
+            Vector256<float> r7 = Avx.InsertVector128(
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
+               Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
+               1);
+
+            // Every field read happened above; from here on only r0..r7 and temporaries are used as inputs,
+            // so the stores to this.V0..V7 cannot overwrite data that is still needed.
+
+            // UnpackLow of (r0, r1) and (r2, r3) -> V0 and V1.
+            Vector256<float> t0 = Avx.UnpackLow(r0, r1);
+            Vector256<float> t2 = Avx.UnpackLow(r2, r3);
+            Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
+            this.V0 = Avx.Blend(t0, v, 0xCC);
+            this.V1 = Avx.Blend(t2, v, 0x33);
+
+            // UnpackLow of (r4, r5) and (r6, r7) -> V4 and V5.
+            Vector256<float> t4 = Avx.UnpackLow(r4, r5);
+            Vector256<float> t6 = Avx.UnpackLow(r6, r7);
+            v = Avx.Shuffle(t4, t6, 0x4E);
+            this.V4 = Avx.Blend(t4, v, 0xCC);
+            this.V5 = Avx.Blend(t6, v, 0x33);
+
+            // UnpackHigh of (r0, r1) and (r2, r3) -> V2 and V3.
+            Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
+            Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
+            v = Avx.Shuffle(t1, t3, 0x4E);
+            this.V2 = Avx.Blend(t1, v, 0xCC);
+            this.V3 = Avx.Blend(t3, v, 0x33);
+
+            // UnpackHigh of (r4, r5) and (r6, r7) -> V6 and V7.
+            Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
+            Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
+            v = Avx.Shuffle(t5, t7, 0x4E);
+            this.V6 = Avx.Blend(t5, v, 0xCC);
+            this.V7 = Avx.Blend(t7, v, 0x33);
+        }
+    }
+}
+#endif
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs
@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

 using System.Numerics;
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@ -16,7 +16,7 @@ using System.Text;
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
 {
    /// <summary>
-    /// Represents a Jpeg block with <see cref="float"/> coefficients.
+    /// 8x8 matrix of <see cref="float"/> coefficients.
    /// </summary>
    [StructLayout(LayoutKind.Explicit)]
    internal partial struct Block8x8F : IEquatable<Block8x8F>
@ -66,30 +66,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        public Vector4 V7L;
        [FieldOffset(240)]
        public Vector4 V7R;
-
-#if SUPPORTS_RUNTIME_INTRINSICS
-        /// <summary>
-        /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
-        /// </summary>
-        public const int RowCount = 8;
-
-        [FieldOffset(0)]
-        public Vector256<float> V0;
-        [FieldOffset(32)]
-        public Vector256<float> V1;
-        [FieldOffset(64)]
-        public Vector256<float> V2;
-        [FieldOffset(96)]
-        public Vector256<float> V3;
-        [FieldOffset(128)]
-        public Vector256<float> V4;
-        [FieldOffset(160)]
-        public Vector256<float> V5;
-        [FieldOffset(192)]
-        public Vector256<float> V6;
-        [FieldOffset(224)]
-        public Vector256<float> V7;
-#endif
 #pragma warning restore SA1600 // ElementsMustBeDocumented

        /// <summary>
@ -102,17 +78,17 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            get
            {
-                GuardBlockIndex(idx);
+                DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
                ref float selfRef = ref Unsafe.As<Block8x8F, float>(ref this);
-                return Unsafe.Add(ref selfRef, idx);
+                return Unsafe.Add(ref selfRef, (nint)(uint)idx);
            }

            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            set
            {
-                GuardBlockIndex(idx);
+                DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
                ref float selfRef = ref Unsafe.As<Block8x8F, float>(ref this);
-                Unsafe.Add(ref selfRef, idx) = value;
+                Unsafe.Add(ref selfRef, (nint)(uint)idx) = value;
            }
        }

@ -188,13 +164,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            return result;
        }

-        /// <summary>
-        /// Fill the block with defaults (zeroes).
-        /// </summary>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public void Clear()
-            => this = default; // The cheapest way to do this in C#:
-
        /// <summary>
        /// Load raw 32bit floating point data from source.
        /// </summary>
@ -302,7 +271,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components

        public float[] ToArray()
        {
-            var result = new float[Size];
+            float[] result = new float[Size];
            this.ScaledCopyTo(result);
            return result;
        }
@ -434,102 +403,37 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
        }

        /// <summary>
-        /// Quantize the block.
+        /// Quantize input block, apply zig-zag ordering and store result as 16bit integers.
        /// </summary>
-        /// <param name="blockPtr">The block pointer.</param>
-        /// <param name="qtPtr">The qt pointer.</param>
-        /// <param name="unzigPtr">Unzig pointer</param>
-        public static unsafe void DequantizeBlock(Block8x8F* blockPtr, Block8x8F* qtPtr, byte* unzigPtr)
-        {
-            float* b = (float*)blockPtr;
-            float* qtp = (float*)qtPtr;
-            for (int qtIndex = 0; qtIndex < Size; qtIndex++)
-            {
-                byte blockIndex = unzigPtr[qtIndex];
-                float* unzigPos = b + blockIndex;
-
-                float val = *unzigPos;
-                val *= qtp[qtIndex];
-                *unzigPos = val;
-            }
-        }
-
-        /// <summary>
-        /// Quantize 'block' into 'dest' using the 'qt' quantization table:
-        /// Unzig the elements of block into dest, while dividing them by elements of qt and "pre-rounding" the values.
-        /// To finish the rounding it's enough to (int)-cast these values.
-        /// </summary>
-        /// <param name="block">Source block</param>
-        /// <param name="dest">Destination block</param>
-        /// <param name="qt">The quantization table</param>
-        /// <param name="unZig">The 8x8 Unzig block.</param>
-        public static unsafe void Quantize(
-            ref Block8x8F block,
-            ref Block8x8F dest,
-            ref Block8x8F qt,
-            ref ZigZag unZig)
+        /// <param name="block">Source block.</param>
+        /// <param name="dest">Destination block.</param>
+        /// <param name="qt">The quantization table.</param>
+        public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt)
        {
-            for (int zig = 0; zig < Size; zig++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
            {
-                dest[zig] = block[unZig[zig]];
+                MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
+                ZigZag.ApplyZigZagOrderingAvx2(ref dest);
            }
-
-            DivideRoundAll(ref dest, ref qt);
-        }
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx.IsSupported)
+            else if (Ssse3.IsSupported)
            {
-                var vnegOne = Vector256.Create(-1f);
-                var vadd = Vector256.Create(.5F);
-                var vone = Vector256.Create(1f);
-
-                for (int i = 0; i < RowCount; i++)
-                {
-                    ref Vector256<float> aRow = ref Unsafe.Add(ref a.V0, i);
-                    ref Vector256<float> bRow = ref Unsafe.Add(ref b.V0, i);
-                    Vector256<float> voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aRow), vone), vadd);
-                    aRow = Avx.Add(Avx.Divide(aRow, bRow), voff);
-                }
+                MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
+                ZigZag.ApplyZigZagOrderingSsse3(ref dest);
            }
            else
 #endif
            {
-                a.V0L = DivideRound(a.V0L, b.V0L);
-                a.V0R = DivideRound(a.V0R, b.V0R);
-                a.V1L = DivideRound(a.V1L, b.V1L);
-                a.V1R = DivideRound(a.V1R, b.V1R);
-                a.V2L = DivideRound(a.V2L, b.V2L);
-                a.V2R = DivideRound(a.V2R, b.V2R);
-                a.V3L = DivideRound(a.V3L, b.V3L);
-                a.V3R = DivideRound(a.V3R, b.V3R);
-                a.V4L = DivideRound(a.V4L, b.V4L);
-                a.V4R = DivideRound(a.V4R, b.V4R);
-                a.V5L = DivideRound(a.V5L, b.V5L);
-                a.V5R = DivideRound(a.V5R, b.V5R);
-                a.V6L = DivideRound(a.V6L, b.V6L);
-                a.V6R = DivideRound(a.V6R, b.V6R);
-                a.V7L = DivideRound(a.V7L, b.V7L);
-                a.V7R = DivideRound(a.V7R, b.V7R);
+                for (int i = 0; i < Size; i++)
+                {
+                    int idx = ZigZag.ZigZagOrder[i];
+                    float quantizedVal = block[idx] * qt[idx];
+                    quantizedVal += quantizedVal < 0 ? -0.5f : 0.5f;
+                    dest[i] = (short)quantizedVal;
+                }
            }
        }

-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor)
-        {
-            var neg = new Vector4(-1);
-            var add = new Vector4(.5F);
-
-            // sign(dividend) = max(min(dividend, 1), -1)
-            Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One);
-
-            // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend)
-            return (dividend / divisor) + (sign * add);
-        }
-
        public void RoundInto(ref Block8x8 dest)
        {
            for (int i = 0; i < Size; i++)
@ -627,6 +531,47 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            Unsafe.Add(ref dRef, 7) = bottom;
        }

+        /// <summary>
+        /// Compares entire 8x8 block to a single scalar value.
+        /// Each coefficient is converted to an integer with truncation before it is compared.
+        /// </summary>
+        /// <param name="value">Value to compare to.</param>
+        /// <returns>
+        /// <see langword="true"/> if every converted coefficient equals <paramref name="value"/>; otherwise <see langword="false"/>.
+        /// </returns>
+        public bool EqualsToScalar(int value)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
+            {
+                // MoveMask over bytes yields 32 bits; all of them are set only when all 8 lanes compared equal.
+                const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
+
+                Vector256<int> targetVector = Vector256.Create(value);
+                ref Vector256<float> blockStride = ref this.V0;
+
+                for (int i = 0; i < RowCount; i++)
+                {
+                    Vector256<int> areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref blockStride, i)), targetVector);
+                    if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask)
+                    {
+                        return false;
+                    }
+                }
+
+                return true;
+            }
+#endif
+            {
+                ref float scalars = ref Unsafe.As<Block8x8F, float>(ref this);
+
+                for (int i = 0; i < Size; i++)
+                {
+                    if ((int)Unsafe.Add(ref scalars, i) != value)
+                    {
+                        return false;
+                    }
+                }
+
+                return true;
+            }
+        }
+
        /// <inheritdoc />
        public bool Equals(Block8x8F other)
            => this.V0L == other.V0L
@ -663,213 +608,89 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
            return sb.ToString();
        }

-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static Vector<float> NormalizeAndRound(Vector<float> row, Vector<float> off, Vector<float> max)
-        {
-            row += off;
-            row = Vector.Max(row, Vector<float>.Zero);
-            row = Vector.Min(row, max);
-            return row.FastRound();
-        }
-
-        [Conditional("DEBUG")]
-        private static void GuardBlockIndex(int idx)
-        {
-            DebugGuard.MustBeLessThan(idx, Size, nameof(idx));
-            DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx));
-        }
-
        /// <summary>
-        /// Transpose the block into the destination block.
+        /// Transpose the block inplace.
        /// </summary>
-        /// <param name="d">The destination block</param>
        [MethodImpl(InliningOptions.ShortMethod)]
-        public void TransposeInto(ref Block8x8F d)
+        public void TransposeInplace()
        {
 #if SUPPORTS_RUNTIME_INTRINSICS
            if (Avx.IsSupported)
            {
-                // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
-                Vector256<float> r0 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V0L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
-                   1);
-
-                Vector256<float> r1 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V1L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
-                   1);
-
-                Vector256<float> r2 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V2L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
-                   1);
-
-                Vector256<float> r3 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V3L).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
-                   1);
-
-                Vector256<float> r4 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
-                   1);
-
-                Vector256<float> r5 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
-                   1);
-
-                Vector256<float> r6 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
-                   1);
-
-                Vector256<float> r7 = Avx.InsertVector128(
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
-                   Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
-                   1);
-
-                Vector256<float> t0 = Avx.UnpackLow(r0, r1);
-                Vector256<float> t2 = Avx.UnpackLow(r2, r3);
-                Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
-                d.V0 = Avx.Blend(t0, v, 0xCC);
-                d.V1 = Avx.Blend(t2, v, 0x33);
-
-                Vector256<float> t4 = Avx.UnpackLow(r4, r5);
-                Vector256<float> t6 = Avx.UnpackLow(r6, r7);
-                v = Avx.Shuffle(t4, t6, 0x4E);
-                d.V4 = Avx.Blend(t4, v, 0xCC);
-                d.V5 = Avx.Blend(t6, v, 0x33);
-
-                Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
-                Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
-                v = Avx.Shuffle(t1, t3, 0x4E);
-                d.V2 = Avx.Blend(t1, v, 0xCC);
-                d.V3 = Avx.Blend(t3, v, 0x33);
-
-                Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
-                Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
-                v = Avx.Shuffle(t5, t7, 0x4E);
-                d.V6 = Avx.Blend(t5, v, 0xCC);
-                d.V7 = Avx.Blend(t7, v, 0x33);
+                this.TransposeInplace_Avx();
            }
            else
 #endif
            {
-                d.V0L.X = this.V0L.X;
-                d.V1L.X = this.V0L.Y;
-                d.V2L.X = this.V0L.Z;
-                d.V3L.X = this.V0L.W;
-                d.V4L.X = this.V0R.X;
-                d.V5L.X = this.V0R.Y;
-                d.V6L.X = this.V0R.Z;
-                d.V7L.X = this.V0R.W;
-
-                d.V0L.Y = this.V1L.X;
-                d.V1L.Y = this.V1L.Y;
-                d.V2L.Y = this.V1L.Z;
-                d.V3L.Y = this.V1L.W;
-                d.V4L.Y = this.V1R.X;
-                d.V5L.Y = this.V1R.Y;
-                d.V6L.Y = this.V1R.Z;
-                d.V7L.Y = this.V1R.W;
-
-                d.V0L.Z = this.V2L.X;
-                d.V1L.Z = this.V2L.Y;
-                d.V2L.Z = this.V2L.Z;
-                d.V3L.Z = this.V2L.W;
-                d.V4L.Z = this.V2R.X;
-                d.V5L.Z = this.V2R.Y;
-                d.V6L.Z = this.V2R.Z;
-                d.V7L.Z = this.V2R.W;
-
-                d.V0L.W = this.V3L.X;
-                d.V1L.W = this.V3L.Y;
-                d.V2L.W = this.V3L.Z;
-                d.V3L.W = this.V3L.W;
-                d.V4L.W = this.V3R.X;
-                d.V5L.W = this.V3R.Y;
-                d.V6L.W = this.V3R.Z;
-                d.V7L.W = this.V3R.W;
-
-                d.V0R.X = this.V4L.X;
-                d.V1R.X = this.V4L.Y;
-                d.V2R.X = this.V4L.Z;
-                d.V3R.X = this.V4L.W;
-                d.V4R.X = this.V4R.X;
-                d.V5R.X = this.V4R.Y;
-                d.V6R.X = this.V4R.Z;
-                d.V7R.X = this.V4R.W;
-
-                d.V0R.Y = this.V5L.X;
-                d.V1R.Y = this.V5L.Y;
-                d.V2R.Y = this.V5L.Z;
-                d.V3R.Y = this.V5L.W;
-                d.V4R.Y = this.V5R.X;
-                d.V5R.Y = this.V5R.Y;
-                d.V6R.Y = this.V5R.Z;
-                d.V7R.Y = this.V5R.W;
-
-                d.V0R.Z = this.V6L.X;
-                d.V1R.Z = this.V6L.Y;
-                d.V2R.Z = this.V6L.Z;
-                d.V3R.Z = this.V6L.W;
-                d.V4R.Z = this.V6R.X;
-                d.V5R.Z = this.V6R.Y;
-                d.V6R.Z = this.V6R.Z;
-                d.V7R.Z = this.V6R.W;
-
-                d.V0R.W = this.V7L.X;
-                d.V1R.W = this.V7L.Y;
-                d.V2R.W = this.V7L.Z;
-                d.V3R.W = this.V7L.W;
-                d.V4R.W = this.V7R.X;
-                d.V5R.W = this.V7R.Y;
-                d.V6R.W = this.V7R.Z;
-                d.V7R.W = this.V7R.W;
+                this.TransposeInplace_Scalar();
            }
        }

        /// <summary>
-        /// Compares entire 8x8 block to a single scalar value.
+        /// Scalar inplace transpose implementation for <see cref="TransposeInplace"/>
        /// </summary>
-        /// <param name="value">Value to compare to.</param>
-        public bool EqualsToScalar(int value)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx2.IsSupported)
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void TransposeInplace_Scalar()
+        {
+            ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref this);
+
+            // row #0
+            Swap(ref Unsafe.Add(ref elemRef, 1), ref Unsafe.Add(ref elemRef, 8));
+            Swap(ref Unsafe.Add(ref elemRef, 2), ref Unsafe.Add(ref elemRef, 16));
+            Swap(ref Unsafe.Add(ref elemRef, 3), ref Unsafe.Add(ref elemRef, 24));
+            Swap(ref Unsafe.Add(ref elemRef, 4), ref Unsafe.Add(ref elemRef, 32));
+            Swap(ref Unsafe.Add(ref elemRef, 5), ref Unsafe.Add(ref elemRef, 40));
+            Swap(ref Unsafe.Add(ref elemRef, 6), ref Unsafe.Add(ref elemRef, 48));
+            Swap(ref Unsafe.Add(ref elemRef, 7), ref Unsafe.Add(ref elemRef, 56));
+
+            // row #1
+            Swap(ref Unsafe.Add(ref elemRef, 10), ref Unsafe.Add(ref elemRef, 17));
+            Swap(ref Unsafe.Add(ref elemRef, 11), ref Unsafe.Add(ref elemRef, 25));
+            Swap(ref Unsafe.Add(ref elemRef, 12), ref Unsafe.Add(ref elemRef, 33));
+            Swap(ref Unsafe.Add(ref elemRef, 13), ref Unsafe.Add(ref elemRef, 41));
+            Swap(ref Unsafe.Add(ref elemRef, 14), ref Unsafe.Add(ref elemRef, 49));
+            Swap(ref Unsafe.Add(ref elemRef, 15), ref Unsafe.Add(ref elemRef, 57));
+
+            // row #2
+            Swap(ref Unsafe.Add(ref elemRef, 19), ref Unsafe.Add(ref elemRef, 26));
+            Swap(ref Unsafe.Add(ref elemRef, 20), ref Unsafe.Add(ref elemRef, 34));
+            Swap(ref Unsafe.Add(ref elemRef, 21), ref Unsafe.Add(ref elemRef, 42));
+            Swap(ref Unsafe.Add(ref elemRef, 22), ref Unsafe.Add(ref elemRef, 50));
+            Swap(ref Unsafe.Add(ref elemRef, 23), ref Unsafe.Add(ref elemRef, 58));
+
+            // row #3
+            Swap(ref Unsafe.Add(ref elemRef, 28), ref Unsafe.Add(ref elemRef, 35));
+            Swap(ref Unsafe.Add(ref elemRef, 29), ref Unsafe.Add(ref elemRef, 43));
+            Swap(ref Unsafe.Add(ref elemRef, 30), ref Unsafe.Add(ref elemRef, 51));
+            Swap(ref Unsafe.Add(ref elemRef, 31), ref Unsafe.Add(ref elemRef, 59));
+
+            // row #4
+            Swap(ref Unsafe.Add(ref elemRef, 37), ref Unsafe.Add(ref elemRef, 44));
+            Swap(ref Unsafe.Add(ref elemRef, 38), ref Unsafe.Add(ref elemRef, 52));
+            Swap(ref Unsafe.Add(ref elemRef, 39), ref Unsafe.Add(ref elemRef, 60));
+
+            // row #5
+            Swap(ref Unsafe.Add(ref elemRef, 46), ref Unsafe.Add(ref elemRef, 53));
+            Swap(ref Unsafe.Add(ref elemRef, 47), ref Unsafe.Add(ref elemRef, 61));
+
+            // row #6
+            Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62));
+
+            static void Swap(ref float a, ref float b)
            {
-                const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
-
-                var targetVector = Vector256.Create(value);
-                ref Vector256<float> blockStride = ref this.V0;
-
-                for (int i = 0; i < RowCount; i++)
-                {
-                    Vector256<int> areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector);
-                    if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask)
-                    {
-                        return false;
-                    }
-                }
-
-                return true;
+                float tmp = a;
+                a = b;
+                b = tmp;
            }
-#endif
-            {
-                ref float scalars = ref Unsafe.As<Block8x8F, float>(ref this);
-
-                for (int i = 0; i < Size; i++)
-                {
-                    if ((int)Unsafe.Add(ref scalars, i) != value)
-                    {
-                        return false;
-                    }
-                }
+        }

-                return true;
-            }
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static Vector<float> NormalizeAndRound(Vector<float> row, Vector<float> off, Vector<float> max)
+        {
+            row += off;
+            row = Vector.Max(row, Vector<float>.Zero);
+            row = Vector.Min(row, max);
+            return row.FastRound();
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/AdobeMarker.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/AdobeMarker.cs
@ -107,4 +107,4 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                this.ColorTransform);
        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromCmykAvx2.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromCmykAvx2.cs
@ -22,60 +22,39 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
+            protected override void ConvertCoreVectorizedInplace(in ComponentValues values)
            {
 #if SUPPORTS_RUNTIME_INTRINSICS
-                ref Vector256<float> cBase =
+                ref Vector256<float> c0Base =
                                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
-                ref Vector256<float> mBase =
+                ref Vector256<float> c1Base =
                                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
-                ref Vector256<float> yBase =
+                ref Vector256<float> c2Base =
                                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
-                ref Vector256<float> kBase =
+                ref Vector256<float> c3Base =
                                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));

-                ref Vector256<float> resultBase =
-                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));
-
                // Used for the color conversion
                var scale = Vector256.Create(1 / this.MaximumValue);
-                var one = Vector256.Create(1F);
-
-                // Used for packing
-                ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
-                Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

-                int n = result.Length / 8;
-                for (int i = 0; i < n; i++)
+                nint n = values.Component0.Length / 8;
+                for (nint i = 0; i < n; i++)
                {
-                    Vector256<float> k = Avx2.PermuteVar8x32(Unsafe.Add(ref kBase, i), vcontrol);
-                    Vector256<float> c = Avx2.PermuteVar8x32(Unsafe.Add(ref cBase, i), vcontrol);
-                    Vector256<float> m = Avx2.PermuteVar8x32(Unsafe.Add(ref mBase, i), vcontrol);
-                    Vector256<float> y = Avx2.PermuteVar8x32(Unsafe.Add(ref yBase, i), vcontrol);
+                    ref Vector256<float> c = ref Unsafe.Add(ref c0Base, i);
+                    ref Vector256<float> m = ref Unsafe.Add(ref c1Base, i);
+                    ref Vector256<float> y = ref Unsafe.Add(ref c2Base, i);
+                    Vector256<float> k = Unsafe.Add(ref c3Base, i);

                    k = Avx.Multiply(k, scale);
-
                    c = Avx.Multiply(Avx.Multiply(c, k), scale);
                    m = Avx.Multiply(Avx.Multiply(m, k), scale);
                    y = Avx.Multiply(Avx.Multiply(y, k), scale);
-
-                    Vector256<float> cmLo = Avx.UnpackLow(c, m);
-                    Vector256<float> yoLo = Avx.UnpackLow(y, one);
-                    Vector256<float> cmHi = Avx.UnpackHigh(c, m);
-                    Vector256<float> yoHi = Avx.UnpackHigh(y, one);
-
-                    ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);
-
-                    destination = Avx.Shuffle(cmLo, yoLo, 0b01_00_01_00);
-                    Unsafe.Add(ref destination, 1) = Avx.Shuffle(cmLo, yoLo, 0b11_10_11_10);
-                    Unsafe.Add(ref destination, 2) = Avx.Shuffle(cmHi, yoHi, 0b01_00_01_00);
-                    Unsafe.Add(ref destination, 3) = Avx.Shuffle(cmHi, yoHi, 0b11_10_11_10);
                }
 #endif
            }

-            protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
-                FromCmykBasic.ConvertCore(values, result, this.MaximumValue);
+            protected override void ConvertCoreInplace(in ComponentValues values) =>
+                FromCmykBasic.ConvertCoreInplace(values, this.MaximumValue);
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromCmykBasic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromCmykBasic.cs
@ -15,38 +15,27 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
-            {
-                ConvertCore(values, result, this.MaximumValue);
-            }
+            public override void ConvertToRgbInplace(in ComponentValues values) =>
+                ConvertCoreInplace(values, this.MaximumValue);

-            internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
+            internal static void ConvertCoreInplace(in ComponentValues values, float maxValue)
            {
-                ReadOnlySpan<float> cVals = values.Component0;
-                ReadOnlySpan<float> mVals = values.Component1;
-                ReadOnlySpan<float> yVals = values.Component2;
-                ReadOnlySpan<float> kVals = values.Component3;
-
-                var v = new Vector4(0, 0, 0, 1F);
-
-                var maximum = 1 / maxValue;
-                var scale = new Vector4(maximum, maximum, maximum, 1F);
+                Span<float> c0 = values.Component0;
+                Span<float> c1 = values.Component1;
+                Span<float> c2 = values.Component2;
+                Span<float> c3 = values.Component3;

-                for (int i = 0; i < result.Length; i++)
+                float scale = 1 / maxValue;
+                for (int i = 0; i < c0.Length; i++)
                {
-                    float c = cVals[i];
-                    float m = mVals[i];
-                    float y = yVals[i];
-                    float k = kVals[i] / maxValue;
-
-                    v.X = c * k;
-                    v.Y = m * k;
-                    v.Z = y * k;
-                    v.W = 1F;
-
-                    v *= scale;
-
-                    result[i] = v;
+                    float c = c0[i];
+                    float m = c1[i];
+                    float y = c2[i];
+                    float k = c3[i] / maxValue;
+
+                    c0[i] = c * k * scale;
+                    c1[i] = m * k * scale;
+                    c2[i] = y * k * scale;
                }
            }
        }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromCmykVector8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromCmykVector8.cs
@ -18,7 +18,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
+            protected override void ConvertCoreVectorizedInplace(in ComponentValues values)
            {
                ref Vector<float> cBase =
                                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
@ -29,43 +29,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                ref Vector<float> kBase =
                                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component3));

-                ref Vector4Octet resultBase =
-                    ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));
-
-                Vector4Pair cc = default;
-                Vector4Pair mm = default;
-                Vector4Pair yy = default;
-                ref Vector<float> ccRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref cc);
-                ref Vector<float> mmRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref mm);
-                ref Vector<float> yyRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref yy);
-
                var scale = new Vector<float>(1 / this.MaximumValue);

                // Walking 8 elements at one step:
-                int n = result.Length / 8;
-                for (int i = 0; i < n; i++)
+                nint n = values.Component0.Length / 8;
+                for (nint i = 0; i < n; i++)
                {
-                    Vector<float> c = Unsafe.Add(ref cBase, i);
-                    Vector<float> m = Unsafe.Add(ref mBase, i);
-                    Vector<float> y = Unsafe.Add(ref yBase, i);
+                    ref Vector<float> c = ref Unsafe.Add(ref cBase, i);
+                    ref Vector<float> m = ref Unsafe.Add(ref mBase, i);
+                    ref Vector<float> y = ref Unsafe.Add(ref yBase, i);
                    Vector<float> k = Unsafe.Add(ref kBase, i) * scale;

                    c = (c * k) * scale;
                    m = (m * k) * scale;
                    y = (y * k) * scale;
-
-                    ccRefAsVector = c;
-                    mmRefAsVector = m;
-                    yyRefAsVector = y;
-
-                    // Collect (c0,c1...c8) (m0,m1...m8) (y0,y1...y8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order:
-                    ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
-                    destination.Pack(ref cc, ref mm, ref yy);
                }
            }

-            protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
-                FromCmykBasic.ConvertCore(values, result, this.MaximumValue);
+            protected override void ConvertCoreInplace(in ComponentValues values) =>
+                FromCmykBasic.ConvertCoreInplace(values, this.MaximumValue);
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromGrayScaleAvx2.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromGrayScaleAvx2.cs
@ -22,42 +22,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
+            protected override void ConvertCoreVectorizedInplace(in ComponentValues values)
            {
 #if SUPPORTS_RUNTIME_INTRINSICS
-                ref Vector256<float> gBase =
+                ref Vector256<float> c0Base =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));

-                ref Vector256<float> resultBase =
-                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));
-
                // Used for the color conversion
                var scale = Vector256.Create(1 / this.MaximumValue);
-                var one = Vector256.Create(1F);
-
-                // Used for packing
-                ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
-                Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

-                int n = result.Length / 8;
-                for (int i = 0; i < n; i++)
+                nint n = values.Component0.Length / 8;
+                for (nint i = 0; i < n; i++)
                {
-                    Vector256<float> g = Avx.Multiply(Unsafe.Add(ref gBase, i), scale);
-
-                    g = Avx2.PermuteVar8x32(g, vcontrol);
-
-                    ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);
-
-                    destination = Avx.Blend(Avx.Permute(g, 0b00_00_00_00), one, 0b1000_1000);
-                    Unsafe.Add(ref destination, 1) = Avx.Blend(Avx.Shuffle(g, g, 0b01_01_01_01), one, 0b1000_1000);
-                    Unsafe.Add(ref destination, 2) = Avx.Blend(Avx.Shuffle(g, g, 0b10_10_10_10), one, 0b1000_1000);
-                    Unsafe.Add(ref destination, 3) = Avx.Blend(Avx.Shuffle(g, g, 0b11_11_11_11), one, 0b1000_1000);
+                    ref Vector256<float> c0 = ref Unsafe.Add(ref c0Base, i);
+                    c0 = Avx.Multiply(c0, scale);
                }
 #endif
            }

-            protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
-                FromGrayscaleBasic.ConvertCore(values, result, this.MaximumValue);
+            protected override void ConvertCoreInplace(in ComponentValues values) =>
+                FromGrayscaleBasic.ScaleValues(values.Component0, this.MaximumValue);
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromGrayScaleBasic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromGrayScaleBasic.cs
@ -17,25 +17,35 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
-            {
-                ConvertCore(values, result, this.MaximumValue);
-            }
+            public override void ConvertToRgbInplace(in ComponentValues values) =>
+                ScaleValues(values.Component0, this.MaximumValue);

-            internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
+            internal static void ScaleValues(Span<float> values, float maxValue)
            {
-                var maximum = 1 / maxValue;
-                var scale = new Vector4(maximum, maximum, maximum, 1F);
+                Span<Vector4> vecValues = MemoryMarshal.Cast<float, Vector4>(values);

-                ref float sBase = ref MemoryMarshal.GetReference(values.Component0);
-                ref Vector4 dBase = ref MemoryMarshal.GetReference(result);
+                var scaleVector = new Vector4(1 / maxValue);

-                for (int i = 0; i < result.Length; i++)
+                for (int i = 0; i < vecValues.Length; i++)
                {
-                    var v = new Vector4(Unsafe.Add(ref sBase, i));
-                    v.W = 1f;
-                    v *= scale;
-                    Unsafe.Add(ref dBase, i) = v;
+                    vecValues[i] *= scaleVector;
+                }
+
+                values = values.Slice(vecValues.Length * 4);
+                if (!values.IsEmpty)
+                {
+                    float scaleValue = 1f / maxValue;
+                    values[0] *= scaleValue;
+
+                    if ((uint)values.Length > 1)
+                    {
+                        values[1] *= scaleValue;
+
+                        if ((uint)values.Length > 2)
+                        {
+                            values[2] *= scaleValue;
+                        }
+                    }
                }
            }
        }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromRgbAvx2.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromRgbAvx2.cs
@ -22,7 +22,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
+            protected override void ConvertCoreVectorizedInplace(in ComponentValues values)
            {
 #if SUPPORTS_RUNTIME_INTRINSICS
                ref Vector256<float> rBase =
@ -32,41 +32,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                ref Vector256<float> bBase =
                                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));

-                ref Vector256<float> resultBase =
-                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));
-
                // Used for the color conversion
                var scale = Vector256.Create(1 / this.MaximumValue);
-                var one = Vector256.Create(1F);
-
-                // Used for packing
-                ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
-                Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);
-
-                int n = result.Length / 8;
-                for (int i = 0; i < n; i++)
+                nint n = values.Component0.Length / 8;
+                for (nint i = 0; i < n; i++)
                {
-                    Vector256<float> r = Avx.Multiply(Avx2.PermuteVar8x32(Unsafe.Add(ref rBase, i), vcontrol), scale);
-                    Vector256<float> g = Avx.Multiply(Avx2.PermuteVar8x32(Unsafe.Add(ref gBase, i), vcontrol), scale);
-                    Vector256<float> b = Avx.Multiply(Avx2.PermuteVar8x32(Unsafe.Add(ref bBase, i), vcontrol), scale);
-
-                    Vector256<float> rgLo = Avx.UnpackLow(r, g);
-                    Vector256<float> boLo = Avx.UnpackLow(b, one);
-                    Vector256<float> rgHi = Avx.UnpackHigh(r, g);
-                    Vector256<float> boHi = Avx.UnpackHigh(b, one);
-
-                    ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);
-
-                    destination = Avx.Shuffle(rgLo, boLo, 0b01_00_01_00);
-                    Unsafe.Add(ref destination, 1) = Avx.Shuffle(rgLo, boLo, 0b11_10_11_10);
-                    Unsafe.Add(ref destination, 2) = Avx.Shuffle(rgHi, boHi, 0b01_00_01_00);
-                    Unsafe.Add(ref destination, 3) = Avx.Shuffle(rgHi, boHi, 0b11_10_11_10);
+                    ref Vector256<float> r = ref Unsafe.Add(ref rBase, i);
+                    ref Vector256<float> g = ref Unsafe.Add(ref gBase, i);
+                    ref Vector256<float> b = ref Unsafe.Add(ref bBase, i);
+                    r = Avx.Multiply(r, scale);
+                    g = Avx.Multiply(g, scale);
+                    b = Avx.Multiply(b, scale);
                }
 #endif
            }

-            protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
-                FromRgbBasic.ConvertCore(values, result, this.MaximumValue);
+            protected override void ConvertCoreInplace(in ComponentValues values) =>
+                FromRgbBasic.ConvertCoreInplace(values, this.MaximumValue);
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromRgbBasic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromRgbBasic.cs
@ -3,6 +3,7 @@

 using System;
 using System.Numerics;
+using System.Runtime.InteropServices;

 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
 {
@ -15,36 +16,19 @ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
+            public override void ConvertToRgbInplace(in ComponentValues values)
            {
-                ConvertCore(values, result, this.MaximumValue);
+                ConvertCoreInplace(values, this.MaximumValue);
            }

-            internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
+            /// <summary>
+            /// Scales the first three components of <paramref name="values"/> in place by 1 / <paramref name="maxValue"/>.
+            /// </summary>
+            internal static void ConvertCoreInplace(in ComponentValues values, float maxValue)
            {
-                ReadOnlySpan<float> rVals = values.Component0;
-                ReadOnlySpan<float> gVals = values.Component1;
-                ReadOnlySpan<float> bVals = values.Component2;
-
-                var v = new Vector4(0, 0, 0, 1);
-
-                var maximum = 1 / maxValue;
-                var scale = new Vector4(maximum, maximum, maximum, 1F);
-
-                for (int i = 0; i < result.Length; i++)
-                {
-                    float r = rVals[i];
-                    float g = gVals[i];
-                    float b = bVals[i];
-
-                    v.X = r;
-                    v.Y = g;
-                    v.Z = b;
-
-                    v *= scale;
-
-                    result[i] = v;
-                }
+                FromGrayscaleBasic.ScaleValues(values.Component0, maxValue);
+                FromGrayscaleBasic.ScaleValues(values.Component1, maxValue);
+                FromGrayscaleBasic.ScaleValues(values.Component2, maxValue);
            }
        }
    }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromRgbVector8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromRgbVector8.cs
@ -18,50 +18,32 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
+            protected override void ConvertCoreVectorizedInplace(in ComponentValues values)
            {
                ref Vector<float> rBase =
-                                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
+                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
                ref Vector<float> gBase =
-                                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
+                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
                ref Vector<float> bBase =
-                                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));
-
-                ref Vector4Octet resultBase =
-                    ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));
-
-                Vector4Pair rr = default;
-                Vector4Pair gg = default;
-                Vector4Pair bb = default;
-                ref Vector<float> rrRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref rr);
-                ref Vector<float> ggRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref gg);
-                ref Vector<float> bbRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref bb);
+                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));

                var scale = new Vector<float>(1 / this.MaximumValue);

                // Walking 8 elements at one step:
-                int n = result.Length / 8;
-                for (int i = 0; i < n; i++)
+                nint n = values.Component0.Length / 8;
+                for (nint i = 0; i < n; i++)
                {
-                    Vector<float> r = Unsafe.Add(ref rBase, i);
-                    Vector<float> g = Unsafe.Add(ref gBase, i);
-                    Vector<float> b = Unsafe.Add(ref bBase, i);
+                    ref Vector<float> r = ref Unsafe.Add(ref rBase, i);
+                    ref Vector<float> g = ref Unsafe.Add(ref gBase, i);
+                    ref Vector<float> b = ref Unsafe.Add(ref bBase, i);
                    r *= scale;
                    g *= scale;
                    b *= scale;
-
-                    rrRefAsVector = r;
-                    ggRefAsVector = g;
-                    bbRefAsVector = b;
-
-                    // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order:
-                    ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
-                    destination.Pack(ref rr, ref gg, ref bb);
                }
            }

-            protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
-                FromRgbBasic.ConvertCore(values, result, this.MaximumValue);
+            protected override void ConvertCoreInplace(in ComponentValues values) =>
+                FromRgbBasic.ConvertCoreInplace(values, this.MaximumValue);
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrAvx2.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrAvx2.cs
@ -23,19 +23,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
+            protected override void ConvertCoreVectorizedInplace(in ComponentValues values)
            {
-                #if SUPPORTS_RUNTIME_INTRINSICS
-                ref Vector256<float> yBase =
+#if SUPPORTS_RUNTIME_INTRINSICS
+                ref Vector256<float> c0Base =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
-                ref Vector256<float> cbBase =
+                ref Vector256<float> c1Base =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
-                ref Vector256<float> crBase =
+                ref Vector256<float> c2Base =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));

-                ref Vector256<float> resultBase =
-                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));
-
                // Used for the color conversion
                var chromaOffset = Vector256.Create(-this.HalfValue);
                var scale = Vector256.Create(1 / this.MaximumValue);
@ -50,19 +47,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

                // Walking 8 elements at one step:
-                int n = result.Length / 8;
-                for (int i = 0; i < n; i++)
+                nint n = values.Component0.Length / 8;
+                for (nint i = 0; i < n; i++)
                {
                    // y = yVals[i];
                    // cb = cbVals[i] - 128F;
                    // cr = crVals[i] - 128F;
-                    Vector256<float> y = Unsafe.Add(ref yBase, i);
-                    Vector256<float> cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset);
-                    Vector256<float> cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset);
+                    ref Vector256<float> c0 = ref Unsafe.Add(ref c0Base, i);
+                    ref Vector256<float> c1 = ref Unsafe.Add(ref c1Base, i);
+                    ref Vector256<float> c2 = ref Unsafe.Add(ref c2Base, i);

-                    y = Avx2.PermuteVar8x32(y, vcontrol);
-                    cb = Avx2.PermuteVar8x32(cb, vcontrol);
-                    cr = Avx2.PermuteVar8x32(cr, vcontrol);
+                    Vector256<float> y = c0;
+                    Vector256<float> cb = Avx.Add(c1, chromaOffset);
+                    Vector256<float> cr = Avx.Add(c2, chromaOffset);

                    // r = y + (1.402F * cr);
                    // g = y - (0.344136F * cb) - (0.714136F * cr);
@ -72,30 +69,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                    Vector256<float> g = HwIntrinsics.MultiplyAdd(HwIntrinsics.MultiplyAdd(y, cb, gCbMult), cr, gCrMult);
                    Vector256<float> b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult);

-                    // TODO: We should be saving to RGBA not Vector4
                    r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale);
                    g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale);
                    b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale);

-                    Vector256<float> vte = Avx.UnpackLow(r, b);
-                    Vector256<float> vto = Avx.UnpackLow(g, va);
-
-                    ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);
-
-                    destination = Avx.UnpackLow(vte, vto);
-                    Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto);
-
-                    vte = Avx.UnpackHigh(r, b);
-                    vto = Avx.UnpackHigh(g, va);
-
-                    Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto);
-                    Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto);
+                    c0 = r;
+                    c1 = g;
+                    c2 = b;
                }
 #endif
            }

-            protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
-                FromYCbCrBasic.ConvertCore(values, result, this.MaximumValue, this.HalfValue);
+            protected override void ConvertCoreInplace(in ComponentValues values) =>
+                FromYCbCrBasic.ConvertCoreInplace(values, this.MaximumValue, this.HalfValue);
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrBasic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrBasic.cs
@ -15,35 +15,26 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
-            {
-                ConvertCore(values, result, this.MaximumValue, this.HalfValue);
-            }
+            public override void ConvertToRgbInplace(in ComponentValues values)
+                => ConvertCoreInplace(values, this.MaximumValue, this.HalfValue);

-            internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue, float halfValue)
+            internal static void ConvertCoreInplace(in ComponentValues values, float maxValue, float halfValue)
            {
-                // TODO: We can optimize a lot here with Vector<float> and SRCS.Unsafe()!
-                ReadOnlySpan<float> yVals = values.Component0;
-                ReadOnlySpan<float> cbVals = values.Component1;
-                ReadOnlySpan<float> crVals = values.Component2;
-
-                var v = new Vector4(0, 0, 0, 1);
+                Span<float> c0 = values.Component0;
+                Span<float> c1 = values.Component1;
+                Span<float> c2 = values.Component2;

-                var scale = new Vector4(1 / maxValue, 1 / maxValue, 1 / maxValue, 1F);
+                var scale = 1 / maxValue;

-                for (int i = 0; i < result.Length; i++)
+                for (int i = 0; i < c0.Length; i++)
                {
-                    float y = yVals[i];
-                    float cb = cbVals[i] - halfValue;
-                    float cr = crVals[i] - halfValue;
-
-                    v.X = MathF.Round(y + (1.402F * cr), MidpointRounding.AwayFromZero);
-                    v.Y = MathF.Round(y - (0.344136F * cb) - (0.714136F * cr), MidpointRounding.AwayFromZero);
-                    v.Z = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero);
-
-                    v *= scale;
+                    float y = c0[i];
+                    float cb = c1[i] - halfValue;
+                    float cr = c2[i] - halfValue;

-                    result[i] = v;
+                    c0[i] = MathF.Round(y + (1.402F * cr), MidpointRounding.AwayFromZero) * scale;
+                    c1[i] = MathF.Round(y - (0.344136F * cb) - (0.714136F * cr), MidpointRounding.AwayFromZero) * scale;
+                    c2[i] = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero) * scale;
                }
            }
        }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrVector4.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrVector4.cs
@ -20,58 +20,54 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters

            protected override bool IsAvailable => SimdUtils.HasVector4;

-            protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
+            protected override void ConvertCoreVectorizedInplace(in ComponentValues values)
            {
-                // TODO: Find a way to properly run & test this path on AVX2 PC-s! (Have I already mentioned that Vector<T> is terrible?)
-                DebugGuard.IsTrue(result.Length % 8 == 0, nameof(result), "result.Length should be divisible by 8!");
+                DebugGuard.IsTrue(values.Component0.Length % 8 == 0, nameof(values), "Length should be divisible by 8!");

-                ref Vector4Pair yBase =
+                ref Vector4Pair c0Base =
                    ref Unsafe.As<float, Vector4Pair>(ref MemoryMarshal.GetReference(values.Component0));
-                ref Vector4Pair cbBase =
+                ref Vector4Pair c1Base =
                    ref Unsafe.As<float, Vector4Pair>(ref MemoryMarshal.GetReference(values.Component1));
-                ref Vector4Pair crBase =
+                ref Vector4Pair c2Base =
                    ref Unsafe.As<float, Vector4Pair>(ref MemoryMarshal.GetReference(values.Component2));

-                ref Vector4Octet resultBase =
-                    ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));
-
                var chromaOffset = new Vector4(-this.HalfValue);
                var maxValue = this.MaximumValue;

                // Walking 8 elements at one step:
-                int n = result.Length / 8;
+                nint n = values.Component0.Length / 8;

-                for (int i = 0; i < n; i++)
+                for (nint i = 0; i < n; i++)
                {
                    // y = yVals[i];
-                    Vector4Pair y = Unsafe.Add(ref yBase, i);
+                    ref Vector4Pair c0 = ref Unsafe.Add(ref c0Base, i);

                    // cb = cbVals[i] - halfValue);
-                    Vector4Pair cb = Unsafe.Add(ref cbBase, i);
-                    cb.AddInplace(chromaOffset);
+                    ref Vector4Pair c1 = ref Unsafe.Add(ref c1Base, i);
+                    c1.AddInplace(chromaOffset);

                    // cr = crVals[i] - halfValue;
-                    Vector4Pair cr = Unsafe.Add(ref crBase, i);
-                    cr.AddInplace(chromaOffset);
+                    ref Vector4Pair c2 = ref Unsafe.Add(ref c2Base, i);
+                    c2.AddInplace(chromaOffset);

                    // r = y + (1.402F * cr);
-                    Vector4Pair r = y;
-                    Vector4Pair tmp = cr;
+                    Vector4Pair r = c0;
+                    Vector4Pair tmp = c2;
                    tmp.MultiplyInplace(1.402F);
                    r.AddInplace(ref tmp);

                    // g = y - (0.344136F * cb) - (0.714136F * cr);
-                    Vector4Pair g = y;
-                    tmp = cb;
+                    Vector4Pair g = c0;
+                    tmp = c1;
                    tmp.MultiplyInplace(-0.344136F);
                    g.AddInplace(ref tmp);
-                    tmp = cr;
+                    tmp = c2;
                    tmp.MultiplyInplace(-0.714136F);
                    g.AddInplace(ref tmp);

                    // b = y + (1.772F * cb);
-                    Vector4Pair b = y;
-                    tmp = cb;
+                    Vector4Pair b = c0;
+                    tmp = c1;
                    tmp.MultiplyInplace(1.772F);
                    b.AddInplace(ref tmp);

@ -79,14 +75,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                    g.RoundAndDownscalePreVector8(maxValue);
                    b.RoundAndDownscalePreVector8(maxValue);

-                    // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order:
-                    ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
-                    destination.Pack(ref r, ref g, ref b);
+                    c0 = r;
+                    c1 = g;
+                    c2 = b;
                }
            }

-            protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
-                FromYCbCrBasic.ConvertCore(values, result, this.MaximumValue, this.HalfValue);
+            protected override void ConvertCoreInplace(in ComponentValues values)
+                => FromYCbCrBasic.ConvertCoreInplace(values, this.MaximumValue, this.HalfValue);
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrVector8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrVector8.cs
@ -19,41 +19,32 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
+            protected override void ConvertCoreVectorizedInplace(in ComponentValues values)
            {
-                ref Vector<float> yBase =
+                ref Vector<float> c0Base =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
-                ref Vector<float> cbBase =
+                ref Vector<float> c1Base =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
-                ref Vector<float> crBase =
+                ref Vector<float> c2Base =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));

-                ref Vector4Octet resultBase =
-                    ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));
-
                var chromaOffset = new Vector<float>(-this.HalfValue);

                // Walking 8 elements at one step:
-                int n = result.Length / 8;
-
-                Vector4Pair rr = default;
-                Vector4Pair gg = default;
-                Vector4Pair bb = default;
-
-                ref Vector<float> rrRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref rr);
-                ref Vector<float> ggRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref gg);
-                ref Vector<float> bbRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref bb);
-
+                nint n = values.Component0.Length / 8;
                var scale = new Vector<float>(1 / this.MaximumValue);

-                for (int i = 0; i < n; i++)
+                for (nint i = 0; i < n; i++)
                {
                    // y = yVals[i];
                    // cb = cbVals[i] - 128F;
                    // cr = crVals[i] - 128F;
-                    Vector<float> y = Unsafe.Add(ref yBase, i);
-                    Vector<float> cb = Unsafe.Add(ref cbBase, i) + chromaOffset;
-                    Vector<float> cr = Unsafe.Add(ref crBase, i) + chromaOffset;
+                    ref Vector<float> c0 = ref Unsafe.Add(ref c0Base, i);
+                    ref Vector<float> c1 = ref Unsafe.Add(ref c1Base, i);
+                    ref Vector<float> c2 = ref Unsafe.Add(ref c2Base, i);
+                    Vector<float> y = Unsafe.Add(ref c0Base, i);
+                    Vector<float> cb = Unsafe.Add(ref c1Base, i) + chromaOffset;
+                    Vector<float> cr = Unsafe.Add(ref c2Base, i) + chromaOffset;

                    // r = y + (1.402F * cr);
                    // g = y - (0.344136F * cb) - (0.714136F * cr);
@ -70,18 +61,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                    g *= scale;
                    b *= scale;

-                    rrRefAsVector = r;
-                    ggRefAsVector = g;
-                    bbRefAsVector = b;
-
-                    // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order:
-                    ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
-                    destination.Pack(ref rr, ref gg, ref bb);
+                    c0 = r;
+                    c1 = g;
+                    c2 = b;
                }
            }

-            protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
-                FromYCbCrBasic.ConvertCore(values, result, this.MaximumValue, this.HalfValue);
+            protected override void ConvertCoreInplace(in ComponentValues values) =>
+                FromYCbCrBasic.ConvertCoreInplace(values, this.MaximumValue, this.HalfValue);
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYccKAvx2.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYccKAvx2.cs
@ -22,52 +22,42 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
+            protected override void ConvertCoreVectorizedInplace(in ComponentValues values)
            {
 #if SUPPORTS_RUNTIME_INTRINSICS
-                ref Vector256<float> yBase =
+                ref Vector256<float> c0Base =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
-                ref Vector256<float> cbBase =
+                ref Vector256<float> c1Base =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
-                ref Vector256<float> crBase =
+                ref Vector256<float> c2Base =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
                ref Vector256<float> kBase =
                    ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));

-                ref Vector256<float> resultBase =
-                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));
-
                // Used for the color conversion
                var chromaOffset = Vector256.Create(-this.HalfValue);
-                var scale = Vector256.Create(1 / this.MaximumValue);
+                var scale = Vector256.Create(1 / (this.MaximumValue * this.MaximumValue));
                var max = Vector256.Create(this.MaximumValue);
                var rCrMult = Vector256.Create(1.402F);
                var gCbMult = Vector256.Create(-0.344136F);
                var gCrMult = Vector256.Create(-0.714136F);
                var bCbMult = Vector256.Create(1.772F);

-                // Used for packing.
-                var va = Vector256.Create(1F);
-                ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
-                Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);
-
                // Walking 8 elements at one step:
-                int n = result.Length / 8;
-                for (int i = 0; i < n; i++)
+                nint n = values.Component0.Length / 8;
+                for (nint i = 0; i < n; i++)
                {
                    // y = yVals[i];
                    // cb = cbVals[i] - 128F;
                    // cr = crVals[i] - 128F;
                    // k = kVals[i] / 256F;
-                    Vector256<float> y = Unsafe.Add(ref yBase, i);
-                    Vector256<float> cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset);
-                    Vector256<float> cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset);
-                    Vector256<float> k = Avx.Divide(Unsafe.Add(ref kBase, i), max);
-
-                    y = Avx2.PermuteVar8x32(y, vcontrol);
-                    cb = Avx2.PermuteVar8x32(cb, vcontrol);
-                    cr = Avx2.PermuteVar8x32(cr, vcontrol);
-                    k = Avx2.PermuteVar8x32(k, vcontrol);
+                    ref Vector256<float> c0 = ref Unsafe.Add(ref c0Base, i);
+                    ref Vector256<float> c1 = ref Unsafe.Add(ref c1Base, i);
+                    ref Vector256<float> c2 = ref Unsafe.Add(ref c2Base, i);
+                    Vector256<float> y = c0;
+                    Vector256<float> cb = Avx.Add(c1, chromaOffset);
+                    Vector256<float> cr = Avx.Add(c2, chromaOffset);
+                    Vector256<float> scaledK = Avx.Multiply(Unsafe.Add(ref kBase, i), scale);

                    // r = y + (1.402F * cr);
                    // g = y - (0.344136F * cb) - (0.714136F * cr);
@ -82,29 +72,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                    g = Avx.Subtract(max, Avx.RoundToNearestInteger(g));
                    b = Avx.Subtract(max, Avx.RoundToNearestInteger(b));

-                    r = Avx.Multiply(Avx.Multiply(r, k), scale);
-                    g = Avx.Multiply(Avx.Multiply(g, k), scale);
-                    b = Avx.Multiply(Avx.Multiply(b, k), scale);
-
-                    Vector256<float> vte = Avx.UnpackLow(r, b);
-                    Vector256<float> vto = Avx.UnpackLow(g, va);
-
-                    ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);
-
-                    destination = Avx.UnpackLow(vte, vto);
-                    Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto);
-
-                    vte = Avx.UnpackHigh(r, b);
-                    vto = Avx.UnpackHigh(g, va);
+                    r = Avx.Multiply(r, scaledK);
+                    g = Avx.Multiply(g, scaledK);
+                    b = Avx.Multiply(b, scaledK);

-                    Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto);
-                    Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto);
+                    c0 = r;
+                    c1 = g;
+                    c2 = b;
                }
 #endif
            }

-            protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
-                FromYccKBasic.ConvertCore(values, result, this.MaximumValue, this.HalfValue);
+            protected override void ConvertCoreInplace(in ComponentValues values) =>
+                FromYccKBasic.ConvertCoreInplace(values, this.MaximumValue, this.HalfValue);
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYccKBasic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYccKBasic.cs
@ -15,39 +15,30 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
-            {
-                ConvertCore(values, result, this.MaximumValue, this.HalfValue);
-            }
+            public override void ConvertToRgbInplace(in ComponentValues values) =>
+                ConvertCoreInplace(values, this.MaximumValue, this.HalfValue);

-            internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue, float halfValue)
+            internal static void ConvertCoreInplace(in ComponentValues values, float maxValue, float halfValue)
            {
-                // TODO: We can optimize a lot here with Vector<float> and SRCS.Unsafe()!
-                ReadOnlySpan<float> yVals = values.Component0;
-                ReadOnlySpan<float> cbVals = values.Component1;
-                ReadOnlySpan<float> crVals = values.Component2;
-                ReadOnlySpan<float> kVals = values.Component3;
+                Span<float> c0 = values.Component0;
+                Span<float> c1 = values.Component1;
+                Span<float> c2 = values.Component2;
+                Span<float> c3 = values.Component3;

                var v = new Vector4(0, 0, 0, 1F);

-                var maximum = 1 / maxValue;
-                var scale = new Vector4(maximum, maximum, maximum, 1F);
+                var scale = 1 / (maxValue * maxValue);

-                for (int i = 0; i < result.Length; i++)
+                for (int i = 0; i < values.Component0.Length; i++)
                {
-                    float y = yVals[i];
-                    float cb = cbVals[i] - halfValue;
-                    float cr = crVals[i] - halfValue;
-                    float k = kVals[i] / maxValue;
-
-                    v.X = (maxValue - MathF.Round(y + (1.402F * cr), MidpointRounding.AwayFromZero)) * k;
-                    v.Y = (maxValue - MathF.Round(y - (0.344136F * cb) - (0.714136F * cr), MidpointRounding.AwayFromZero)) * k;
-                    v.Z = (maxValue - MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero)) * k;
-                    v.W = 1F;
-
-                    v *= scale;
-
-                    result[i] = v;
+                    float y = c0[i];
+                    float cb = c1[i] - halfValue;
+                    float cr = c2[i] - halfValue;
+                    float scaledK = c3[i] * scale;
+
+                    c0[i] = (maxValue - MathF.Round(y + (1.402F * cr), MidpointRounding.AwayFromZero)) * scaledK;
+                    c1[i] = (maxValue - MathF.Round(y - (0.344136F * cb) - (0.714136F * cr), MidpointRounding.AwayFromZero)) * scaledK;
+                    c2[i] = (maxValue - MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero)) * scaledK;
                }
            }
        }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYccKVector8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYccKVector8.cs
@ -18,46 +18,39 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            {
            }

-            protected override void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result)
+            protected override void ConvertCoreVectorizedInplace(in ComponentValues values)
            {
-                ref Vector<float> yBase =
+                ref Vector<float> c0Base =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
-                ref Vector<float> cbBase =
+                ref Vector<float> c1Base =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
-                ref Vector<float> crBase =
+                ref Vector<float> c2Base =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));
                ref Vector<float> kBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component3));

-                ref Vector4Octet resultBase =
-                    ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));
-
                var chromaOffset = new Vector<float>(-this.HalfValue);

                // Walking 8 elements at one step:
-                int n = result.Length / 8;
-
-                Vector4Pair rr = default;
-                Vector4Pair gg = default;
-                Vector4Pair bb = default;
-
-                ref Vector<float> rrRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref rr);
-                ref Vector<float> ggRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref gg);
-                ref Vector<float> bbRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref bb);
+                nint n = values.Component0.Length / 8;

-                var scale = new Vector<float>(1 / this.MaximumValue);
                var max = new Vector<float>(this.MaximumValue);
+                var scale = new Vector<float>(1f) / (max * max);

-                for (int i = 0; i < n; i++)
+                for (nint i = 0; i < n; i++)
                {
                    // y = yVals[i];
                    // cb = cbVals[i] - 128F;
                    // cr = crVals[i] - 128F;
                    // k = kVals[i] / 256F;
-                    Vector<float> y = Unsafe.Add(ref yBase, i);
-                    Vector<float> cb = Unsafe.Add(ref cbBase, i) + chromaOffset;
-                    Vector<float> cr = Unsafe.Add(ref crBase, i) + chromaOffset;
-                    Vector<float> k = Unsafe.Add(ref kBase, i) / max;
+                    ref Vector<float> c0 = ref Unsafe.Add(ref c0Base, i);
+                    ref Vector<float> c1 = ref Unsafe.Add(ref c1Base, i);
+                    ref Vector<float> c2 = ref Unsafe.Add(ref c2Base, i);
+
+                    Vector<float> y = c0;
+                    Vector<float> cb = c1 + chromaOffset;
+                    Vector<float> cr = c2 + chromaOffset;
+                    Vector<float> scaledK = Unsafe.Add(ref kBase, i) * scale;

                    // r = y + (1.402F * cr);
                    // g = y - (0.344136F * cb) - (0.714136F * cr);
@ -67,25 +60,18 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                    Vector<float> g = y - (cb * new Vector<float>(0.344136F)) - (cr * new Vector<float>(0.714136F));
                    Vector<float> b = y + (cb * new Vector<float>(1.772F));

-                    r = (max - r.FastRound()) * k;
-                    g = (max - g.FastRound()) * k;
-                    b = (max - b.FastRound()) * k;
-                    r *= scale;
-                    g *= scale;
-                    b *= scale;
-
-                    rrRefAsVector = r;
-                    ggRefAsVector = g;
-                    bbRefAsVector = b;
+                    r = (max - r.FastRound()) * scaledK;
+                    g = (max - g.FastRound()) * scaledK;
+                    b = (max - b.FastRound()) * scaledK;

-                    // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order:
-                    ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
-                    destination.Pack(ref rr, ref gg, ref bb);
+                    c0 = r;
+                    c1 = g;
+                    c2 = b;
                }
            }

-            protected override void ConvertCore(in ComponentValues values, Span<Vector4> result) =>
-                FromYccKBasic.ConvertCore(values, result, this.MaximumValue, this.HalfValue);
+            protected override void ConvertCoreInplace(in ComponentValues values) =>
+                FromYccKBasic.ConvertCoreInplace(values, this.MaximumValue, this.HalfValue);
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.VectorizedJpegColorConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.VectorizedJpegColorConverter.cs
@ -18,10 +18,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                this.vectorSize = vectorSize;
            }

-            public sealed override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
+            public override void ConvertToRgbInplace(in ComponentValues values)
            {
-                int remainder = result.Length % this.vectorSize;
-                int simdCount = result.Length - remainder;
+                int length = values.Component0.Length;
+                int remainder = values.Component0.Length % this.vectorSize;
+                int simdCount = length - remainder;
                if (simdCount > 0)
                {
                    // This implementation is actually AVX specific.
@ -32,15 +33,15 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                            "This converter can be used only on architecture having 256 byte floating point SIMD registers!");
                    }

-                    this.ConvertCoreVectorized(values.Slice(0, simdCount), result.Slice(0, simdCount));
+                    this.ConvertCoreVectorizedInplace(values.Slice(0, simdCount));
                }

-                this.ConvertCore(values.Slice(simdCount, remainder), result.Slice(simdCount, remainder));
+                this.ConvertCoreInplace(values.Slice(simdCount, remainder));
            }

-            protected abstract void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result);
+            protected virtual void ConvertCoreVectorizedInplace(in ComponentValues values) => throw new NotImplementedException();

-            protected abstract void ConvertCore(in ComponentValues values, Span<Vector4> result);
+            protected virtual void ConvertCoreInplace(in ComponentValues values) => throw new NotImplementedException();
        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
@ -76,11 +76,10 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
        }

        /// <summary>
-        /// He implementation of the conversion.
+        /// Converts planar jpeg component values in <paramref name="values"/> to RGB color space inplace.
        /// </summary>
-        /// <param name="values">The input as a stack-only <see cref="ComponentValues"/> struct</param>
-        /// <param name="result">The destination buffer of <see cref="Vector4"/> values</param>
-        public abstract void ConvertToRgba(in ComponentValues values, Span<Vector4> result);
+        /// <param name="values">The input/output as a stack-only <see cref="ComponentValues"/> struct</param>
+        public abstract void ConvertToRgbInplace(in ComponentValues values);

        /// <summary>
        /// Returns the <see cref="JpegColorConverter"/>s for all supported colorspaces and precisions.
@ -181,22 +180,22 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
            /// <summary>
            /// The component 0 (eg. Y)
            /// </summary>
-            public readonly ReadOnlySpan<float> Component0;
+            public readonly Span<float> Component0;

            /// <summary>
-            /// The component 1 (eg. Cb)
+            /// The component 1 (eg. Cb). In case of grayscale, it points to <see cref="Component0"/>.
            /// </summary>
-            public readonly ReadOnlySpan<float> Component1;
+            public readonly Span<float> Component1;

            /// <summary>
-            /// The component 2 (eg. Cr)
+            /// The component 2 (eg. Cr). In case of grayscale, it points to <see cref="Component0"/>.
            /// </summary>
-            public readonly ReadOnlySpan<float> Component2;
+            public readonly Span<float> Component2;

            /// <summary>
            /// The component 4
            /// </summary>
-            public readonly ReadOnlySpan<float> Component3;
+            public readonly Span<float> Component3;

            /// <summary>
            /// Initializes a new instance of the <see cref="ComponentValues"/> struct.
@ -208,30 +207,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
                this.ComponentCount = componentBuffers.Count;

                this.Component0 = componentBuffers[0].GetRowSpan(row);
-                this.Component1 = Span<float>.Empty;
-                this.Component2 = Span<float>.Empty;
-                this.Component3 = Span<float>.Empty;
-
-                if (this.ComponentCount > 1)
-                {
-                    this.Component1 = componentBuffers[1].GetRowSpan(row);
-                    if (this.ComponentCount > 2)
-                    {
-                        this.Component2 = componentBuffers[2].GetRowSpan(row);
-                        if (this.ComponentCount > 3)
-                        {
-                            this.Component3 = componentBuffers[3].GetRowSpan(row);
-                        }
-                    }
-                }
+
+                // In case of grayscale, Component1 and Component2 point to Component0 memory area
+                this.Component1 = this.ComponentCount > 1 ? componentBuffers[1].GetRowSpan(row) : this.Component0;
+                this.Component2 = this.ComponentCount > 2 ? componentBuffers[2].GetRowSpan(row) : this.Component0;
+                this.Component3 = this.ComponentCount > 3 ? componentBuffers[3].GetRowSpan(row) : Span<float>.Empty;
            }

-            private ComponentValues(
+            internal ComponentValues(
                int componentCount,
-                ReadOnlySpan<float> c0,
-                ReadOnlySpan<float> c1,
-                ReadOnlySpan<float> c2,
-                ReadOnlySpan<float> c3)
+                Span<float> c0,
+                Span<float> c1,
+                Span<float> c2,
+                Span<float> c3)
            {
                this.ComponentCount = componentCount;
                this.Component0 = c0;
@ -242,111 +230,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters

            public ComponentValues Slice(int start, int length)
            {
-                ReadOnlySpan<float> c0 = this.Component0.Slice(start, length);
-                ReadOnlySpan<float> c1 = this.ComponentCount > 1 ? this.Component1.Slice(start, length) : ReadOnlySpan<float>.Empty;
-                ReadOnlySpan<float> c2 = this.ComponentCount > 2 ? this.Component2.Slice(start, length) : ReadOnlySpan<float>.Empty;
-                ReadOnlySpan<float> c3 = this.ComponentCount > 3 ? this.Component3.Slice(start, length) : ReadOnlySpan<float>.Empty;
+                Span<float> c0 = this.Component0.Slice(start, length);
+                Span<float> c1 = this.Component1.Length > 0 ? this.Component1.Slice(start, length) : Span<float>.Empty;
+                Span<float> c2 = this.Component2.Length > 0 ? this.Component2.Slice(start, length) : Span<float>.Empty;
+                Span<float> c3 = this.Component3.Length > 0 ? this.Component3.Slice(start, length) : Span<float>.Empty;

                return new ComponentValues(this.ComponentCount, c0, c1, c2, c3);
            }
        }
-
-        internal struct Vector4Octet
-        {
-#pragma warning disable SA1132 // Do not combine fields
-            public Vector4 V0, V1, V2, V3, V4, V5, V6, V7;
-
-            /// <summary>
-            /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ...
-            /// </summary>
-            public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b)
-            {
-                this.V0.X = r.A.X;
-                this.V0.Y = g.A.X;
-                this.V0.Z = b.A.X;
-                this.V0.W = 1f;
-
-                this.V1.X = r.A.Y;
-                this.V1.Y = g.A.Y;
-                this.V1.Z = b.A.Y;
-                this.V1.W = 1f;
-
-                this.V2.X = r.A.Z;
-                this.V2.Y = g.A.Z;
-                this.V2.Z = b.A.Z;
-                this.V2.W = 1f;
-
-                this.V3.X = r.A.W;
-                this.V3.Y = g.A.W;
-                this.V3.Z = b.A.W;
-                this.V3.W = 1f;
-
-                this.V4.X = r.B.X;
-                this.V4.Y = g.B.X;
-                this.V4.Z = b.B.X;
-                this.V4.W = 1f;
-
-                this.V5.X = r.B.Y;
-                this.V5.Y = g.B.Y;
-                this.V5.Z = b.B.Y;
-                this.V5.W = 1f;
-
-                this.V6.X = r.B.Z;
-                this.V6.Y = g.B.Z;
-                this.V6.Z = b.B.Z;
-                this.V6.W = 1f;
-
-                this.V7.X = r.B.W;
-                this.V7.Y = g.B.W;
-                this.V7.Z = b.B.W;
-                this.V7.W = 1f;
-            }
-
-            /// <summary>
-            /// Pack (g0,g1...g7) vector values as (g0,g0,g0,1), (g1,g1,g1,1) ...
-            /// </summary>
-            public void Pack(ref Vector4Pair g)
-            {
-                this.V0.X = g.A.X;
-                this.V0.Y = g.A.X;
-                this.V0.Z = g.A.X;
-                this.V0.W = 1f;
-
-                this.V1.X = g.A.Y;
-                this.V1.Y = g.A.Y;
-                this.V1.Z = g.A.Y;
-                this.V1.W = 1f;
-
-                this.V2.X = g.A.Z;
-                this.V2.Y = g.A.Z;
-                this.V2.Z = g.A.Z;
-                this.V2.W = 1f;
-
-                this.V3.X = g.A.W;
-                this.V3.Y = g.A.W;
-                this.V3.Z = g.A.W;
-                this.V3.W = 1f;
-
-                this.V4.X = g.B.X;
-                this.V4.Y = g.B.X;
-                this.V4.Z = g.B.X;
-                this.V4.W = 1f;
-
-                this.V5.X = g.B.Y;
-                this.V5.Y = g.B.Y;
-                this.V5.Z = g.B.Y;
-                this.V5.W = 1f;
-
-                this.V6.X = g.B.Z;
-                this.V6.Y = g.B.Z;
-                this.V6.Z = g.B.Z;
-                this.V6.W = 1f;
-
-                this.V7.X = g.B.W;
-                this.V7.Y = g.B.W;
-                this.V7.Z = g.B.W;
-                this.V7.W = 1f;
-            }
-        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanBuffer.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanBuffer.cs
@ -80,7 +80,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        [MethodImpl(InliningOptions.ShortMethod)]
        public bool HasBadMarker() => this.Marker != JpegConstants.Markers.XFF && !this.HasRestartMarker();

-        [MethodImpl(InliningOptions.ShortMethod)]
+        [MethodImpl(InliningOptions.AlwaysInline)]
        public void FillBuffer()
        {
            // Attempt to load at least the minimum number of required bits into the buffer.
@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        [MethodImpl(InliningOptions.ShortMethod)]
        public int PeekBits(int nbits) => (int)ExtractBits(this.data, this.remainingBits - nbits, nbits);

-        [MethodImpl(InliningOptions.ShortMethod)]
+        [MethodImpl(InliningOptions.AlwaysInline)]
        private static ulong ExtractBits(ulong value, int offset, int size) => (value >> offset) & (ulong)((1 << size) - 1);

        [MethodImpl(InliningOptions.ShortMethod)]
@ -207,7 +207,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            }
        }

-        [MethodImpl(InliningOptions.ShortMethod)]
+        [MethodImpl(InliningOptions.AlwaysInline)]
        private int ReadStream()
        {
            int value = this.badData ? 0 : this.stream.ReadByte();
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
@ -38,10 +38,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        /// </summary>
        private int restartInterval;

-        // How many mcu's are left to do.
+        /// <summary>
+        /// How many mcu's are left to do.
+        /// </summary>
        private int todo;

-        // The End-Of-Block countdown for ending the sequence prematurely when the remaining coefficients are zero.
+        /// <summary>
+        /// The End-Of-Block countdown for ending the sequence prematurely when the remaining coefficients are zero.
+        /// </summary>
        private int eobrun;

        /// <summary>
@ -54,14 +58,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        /// </summary>
        private readonly HuffmanTable[] acHuffmanTables;

-        // The unzig data.
-        private ZigZag dctZigZag;
-
        private HuffmanScanBuffer scanBuffer;

        private readonly SpectralConverter spectralConverter;

-        private CancellationToken cancellationToken;
+        private readonly CancellationToken cancellationToken;

        /// <summary>
        /// Initializes a new instance of the <see cref="HuffmanScanDecoder"/> class.
@ -74,7 +75,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            SpectralConverter converter,
            CancellationToken cancellationToken)
        {
-            this.dctZigZag = ZigZag.CreateUnzigTable();
            this.stream = stream;
            this.spectralConverter = converter;
            this.cancellationToken = cancellationToken;
@ -477,7 +477,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        {
            ref short blockDataRef = ref Unsafe.As<Block8x8, short>(ref block);
            ref HuffmanScanBuffer buffer = ref this.scanBuffer;
-            ref ZigZag zigzag = ref this.dctZigZag;

            // DC
            int t = buffer.DecodeHuffman(ref dcTable);
@ -502,7 +501,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                {
                    i += r;
                    s = buffer.Receive(s);
-                    Unsafe.Add(ref blockDataRef, zigzag[i++]) = (short)s;
+                    Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s;
                }
                else
                {
@ -556,7 +555,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                }

                ref HuffmanScanBuffer buffer = ref this.scanBuffer;
-                ref ZigZag zigzag = ref this.dctZigZag;
                int start = this.SpectralStart;
                int end = this.SpectralEnd;
                int low = this.SuccessiveLow;
@ -572,7 +570,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                    if (s != 0)
                    {
                        s = buffer.Receive(s);
-                        Unsafe.Add(ref blockDataRef, zigzag[i]) = (short)(s << low);
+                        Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low);
                    }
                    else
                    {
@ -602,7 +600,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        {
            // Refinement scan for these AC coefficients
            ref HuffmanScanBuffer buffer = ref this.scanBuffer;
-            ref ZigZag zigzag = ref this.dctZigZag;
            int start = this.SpectralStart;
            int end = this.SpectralEnd;

@ -649,7 +646,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder

                    do
                    {
-                        ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]);
+                        ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
                        if (coef != 0)
                        {
                            buffer.CheckBits();
@ -675,7 +672,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder

                    if ((s != 0) && (k < 64))
                    {
-                        Unsafe.Add(ref blockDataRef, zigzag[k]) = (short)s;
+                        Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s;
                    }
                }
            }
@ -684,7 +681,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            {
                for (; k <= end; k++)
                {
-                    ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]);
+                    ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);

                    if (coef != 0)
                    {
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IJpegComponent.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IJpegComponent.cs
@ -45,4 +45,4 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        /// </summary>
        Buffer2D<Block8x8> SpectralBlocks { get; }
    }
-}
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs
@ -22,7 +22,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        IJpegComponent[] Components { get; }

        /// <summary>
-        /// Gets the quantization tables, in zigzag order.
+        /// Gets the quantization tables, in natural order.
        /// </summary>
        Block8x8F[] QuantizationTables { get; }
    }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JFifMarker.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JFifMarker.cs
@ -125,4 +125,4 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                this.YDensity);
        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@ -19,14 +19,9 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        public Block8x8F SourceBlock;

        /// <summary>
-        /// Temporal block 1 to store intermediate and/or final computation results.
+        /// Temporal block to store intermediate computation results.
        /// </summary>
-        public Block8x8F WorkspaceBlock1;
-
-        /// <summary>
-        /// Temporal block 2 to store intermediate and/or final computation results.
-        /// </summary>
-        public Block8x8F WorkspaceBlock2;
+        public Block8x8F WorkspaceBlock;

        /// <summary>
        /// The quantization table as <see cref="Block8x8F"/>.
@ -46,12 +41,11 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        public JpegBlockPostProcessor(IRawJpegData decoder, IJpegComponent component)
        {
            int qtIndex = component.QuantizationTableIndex;
-            this.DequantiazationTable = ZigZag.CreateDequantizationTable(ref decoder.QuantizationTables[qtIndex]);
+            this.DequantiazationTable = decoder.QuantizationTables[qtIndex];
            this.subSamplingDivisors = component.SubSamplingDivisors;

            this.SourceBlock = default;
-            this.WorkspaceBlock1 = default;
-            this.WorkspaceBlock2 = default;
+            this.WorkspaceBlock = default;
        }

        /// <summary>
@ -71,20 +65,20 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            int destAreaStride,
            float maximumValue)
        {
-            ref Block8x8F b = ref this.SourceBlock;
-            b.LoadFrom(ref sourceBlock);
+            ref Block8x8F block = ref this.SourceBlock;
+            block.LoadFrom(ref sourceBlock);

            // Dequantize:
-            b.MultiplyInPlace(ref this.DequantiazationTable);
+            block.MultiplyInPlace(ref this.DequantiazationTable);

-            FastFloatingPointDCT.TransformIDCT(ref b, ref this.WorkspaceBlock1, ref this.WorkspaceBlock2);
+            FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock);

            // To conform better to libjpeg we actually NEED TO loose precision here.
            // This is because they store blocks as Int16 between all the operations.
            // To be "more accurate", we need to emulate this by rounding!
-            this.WorkspaceBlock1.NormalizeColorsAndRoundInPlace(maximumValue);
+            block.NormalizeColorsAndRoundInPlace(maximumValue);

-            this.WorkspaceBlock1.ScaledCopyTo(
+            block.ScaledCopyTo(
                ref destAreaOrigin,
                destAreaStride,
                this.subSamplingDivisors.Width,
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegColorSpace.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegColorSpace.cs
@ -20,4 +20,4 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder

        YCbCr
    }
-}
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegComponent.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegComponent.cs
@ -2,7 +2,6 @@
 // Licensed under the Apache License, Version 2.0.

 using System;
-
 using SixLabors.ImageSharp.Memory;

 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegFileMarker.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegFileMarker.cs
@ -66,4 +66,4 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            return this.Marker.ToString("X");
        }
    }
-}
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
@ -1,6 +1,8 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

+using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters;
+
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
 {
    /// <summary>
@ -30,5 +32,13 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
        /// Actual stride height depends on the subsampling factor of the given component.
        /// </remarks>
        public abstract void ConvertStrideBaseline();
+
+        /// <summary>
+        /// Gets the color converter.
+        /// </summary>
+        /// <param name="frame">The jpeg frame with the color space to convert to.</param>
+        /// <param name="jpegData">The raw JPEG data.</param>
+        /// <returns>The color converter.</returns>
+        protected virtual JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(jpegData.ColorSpace, frame.Precision);
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter{TPixel}.cs
@ -11,18 +11,21 @@ using SixLabors.ImageSharp.PixelFormats;

 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
 {
-    internal sealed class SpectralConverter<TPixel> : SpectralConverter, IDisposable
+    internal class SpectralConverter<TPixel> : SpectralConverter, IDisposable
        where TPixel : unmanaged, IPixel<TPixel>
    {
        private readonly Configuration configuration;

-        private CancellationToken cancellationToken;
+        private readonly CancellationToken cancellationToken;

        private JpegComponentPostProcessor[] componentProcessors;

        private JpegColorConverter colorConverter;

-        private IMemoryOwner<Vector4> rgbaBuffer;
+        // private IMemoryOwner<Vector4> rgbaBuffer;
+        private IMemoryOwner<byte> rgbBuffer;
+
+        private IMemoryOwner<TPixel> paddedProxyPixelRow;

        private Buffer2D<TPixel> pixelBuffer;

@ -40,25 +43,23 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder

        private bool Converted => this.pixelRowCounter >= this.pixelBuffer.Height;

-        public Buffer2D<TPixel> PixelBuffer
+        public Buffer2D<TPixel> GetPixelBuffer()
        {
-            get
+            if (!this.Converted)
            {
-                if (!this.Converted)
-                {
-                    int steps = (int)Math.Ceiling(this.pixelBuffer.Height / (float)this.pixelRowsPerStep);
+                int steps = (int)Math.Ceiling(this.pixelBuffer.Height / (float)this.pixelRowsPerStep);

-                    for (int step = 0; step < steps; step++)
-                    {
-                        this.cancellationToken.ThrowIfCancellationRequested();
-                        this.ConvertNextStride(step);
-                    }
+                for (int step = 0; step < steps; step++)
+                {
+                    this.cancellationToken.ThrowIfCancellationRequested();
+                    this.ConvertNextStride(step);
                }
-
-                return this.pixelBuffer;
            }
+
+            return this.pixelBuffer;
        }

+        /// <inheritdoc/>
        public override void InjectFrameData(JpegFrame frame, IRawJpegData jpegData)
        {
            MemoryAllocator allocator = this.configuration.MemoryAllocator;
@ -71,7 +72,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            this.pixelRowsPerStep = this.blockRowsPerStep * blockPixelHeight;

            // pixel buffer for resulting image
-            this.pixelBuffer = allocator.Allocate2D<TPixel>(frame.PixelWidth, frame.PixelHeight, AllocationOptions.Clean);
+            this.pixelBuffer = allocator.Allocate2D<TPixel>(frame.PixelWidth, frame.PixelHeight);
+            this.paddedProxyPixelRow = allocator.Allocate<TPixel>(frame.PixelWidth + 3);

            // component processors from spectral to Rgba32
            var postProcessorBufferSize = new Size(c0.SizeInBlocks.Width * 8, this.pixelRowsPerStep);
@ -82,12 +84,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
            }

            // single 'stride' rgba32 buffer for conversion between spectral and TPixel
-            this.rgbaBuffer = allocator.Allocate<Vector4>(frame.PixelWidth);
+            // this.rgbaBuffer = allocator.Allocate<Vector4>(frame.PixelWidth);
+            this.rgbBuffer = allocator.Allocate<byte>(frame.PixelWidth * 3);

            // color converter from Rgba32 to TPixel
-            this.colorConverter = JpegColorConverter.GetConverter(jpegData.ColorSpace, frame.Precision);
+            this.colorConverter = this.GetColorConverter(frame, jpegData);
        }

+        /// <inheritdoc/>
        public override void ConvertStrideBaseline()
        {
            // Convert next pixel stride using single spectral `stride'
@ -113,7 +117,8 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                }
            }

-            this.rgbaBuffer?.Dispose();
+            this.rgbBuffer?.Dispose();
+            this.paddedProxyPixelRow?.Dispose();
        }

        private void ConvertNextStride(int spectralStep)
@ -127,17 +132,38 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder
                buffers[i] = this.componentProcessors[i].ColorBuffer;
            }

+            int width = this.pixelBuffer.Width;
+
            for (int yy = this.pixelRowCounter; yy < maxY; yy++)
            {
                int y = yy - this.pixelRowCounter;

                var values = new JpegColorConverter.ComponentValues(buffers, y);
-                this.colorConverter.ConvertToRgba(values, this.rgbaBuffer.GetSpan());

-                Span<TPixel> destRow = this.pixelBuffer.GetRowSpan(yy);
+                this.colorConverter.ConvertToRgbInplace(values);
+                values = values.Slice(0, width); // slice away Jpeg padding

-                // TODO: Investigate if slicing is actually necessary
-                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, this.rgbaBuffer.GetSpan().Slice(0, destRow.Length), destRow);
+                Span<byte> r = this.rgbBuffer.Slice(0, width);
+                Span<byte> g = this.rgbBuffer.Slice(width, width);
+                Span<byte> b = this.rgbBuffer.Slice(width * 2, width);
+
+                SimdUtils.NormalizedFloatToByteSaturate(values.Component0, r);
+                SimdUtils.NormalizedFloatToByteSaturate(values.Component1, g);
+                SimdUtils.NormalizedFloatToByteSaturate(values.Component2, b);
+
+                // PackFromRgbPlanes expects the destination to be padded, so try to get padded span containing extra elements from the next row.
+                // If we can't get such a padded row because we are on a MemoryGroup boundary or at the last row,
+                // pack pixels to a temporary, padded proxy buffer, then copy the relevant values to the destination row.
+                if (this.pixelBuffer.TryGetPaddedRowSpan(yy, 3, out Span<TPixel> destRow))
+                {
+                    PixelOperations<TPixel>.Instance.PackFromRgbPlanes(this.configuration, r, g, b, destRow);
+                }
+                else
+                {
+                    Span<TPixel> proxyRow = this.paddedProxyPixelRow.GetSpan();
+                    PixelOperations<TPixel>.Instance.PackFromRgbPlanes(this.configuration, r, g, b, proxyRow);
+                    proxyRow.Slice(0, width).CopyTo(this.pixelBuffer.GetRowSpan(yy));
+                }
            }

            this.pixelRowCounter += this.pixelRowsPerStep;
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffIndex.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffIndex.cs
@ -32,4 +32,4 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder

        // ReSharper restore UnusedMember.Local
    }
-}
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
@ -5,10 +5,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
    /// <summary>
    /// A compiled look-up table representation of a huffmanSpec.
-    /// Each value maps to a int32 of which the 24 most significant bits hold the
-    /// codeword in bits and the 8 least significant bits hold the codeword size.
    /// The maximum codeword size is 16 bits.
    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// Each value maps to an int32 of which the 24 most significant bits hold the
+    /// codeword in bits and the 8 least significant bits hold the codeword size.
+    /// </para>
+    /// <para>
+    /// Code value occupies 24 most significant bits as integer value.
+    /// This value is shifted to the MSB position for performance reasons.
+    /// For example, decimal value 10 is stored like this:
+    /// <code>
+    /// MSB                                LSB
+    /// 1010 0000 00000000 00000000 | 00000100
+    /// </code>
+    /// This was done to eliminate extra binary shifts in the encoder.
+    /// The code length is stored as an 8 bit integer value in the least significant bits.
+    /// </para>
+    /// </remarks>
    internal readonly struct HuffmanLut
    {
        /// <summary>
@ -54,7 +69,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                int len = i + 1;
                for (int j = 0; j < spec.Count[i]; j++)
                {
-                    this.Values[spec.Values[k]] = len | (code << 8);
+                    this.Values[spec.Values[k]] = len | (code << (32 - len));
                    code++;
                    k++;
                }
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@ -1,12 +1,11 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

+using System;
 using System.IO;
+using System.Numerics;
 using System.Runtime.CompilerServices;
-#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-#endif
+using System.Runtime.InteropServices;
 using System.Threading;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
@ -16,67 +15,134 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
    internal class HuffmanScanEncoder
    {
        /// <summary>
-        /// Compiled huffman tree to encode given values.
+        /// Maximum number of bytes encoded jpeg 8x8 block can occupy.
+        /// It's highly unlikely for block to occupy this much space - it's a theoretical limit.
        /// </summary>
-        /// <remarks>Yields codewords by index consisting of [run length | bitsize].</remarks>
-        private HuffmanLut[] huffmanTables;
+        /// <remarks>
+        /// Where 16 is maximum huffman code binary length according to itu
+        /// specs. 10 is maximum value binary length, value comes from discrete
+        /// cosine transform with value range: [-1024..1023]. Block stores
+        /// 8x8 = 64 values thus multiplication by 64. Then divided by 8 to get
+        /// the number of bytes. This value is then multiplied by
+        /// <see cref="MaxBytesPerBlockMultiplier"/> for performance reasons.
+        /// </remarks>
+        private const int MaxBytesPerBlock = (16 + 10) * 64 / 8 * MaxBytesPerBlockMultiplier;
+
+        /// <summary>
+        /// Multiplier used within cache buffers size calculation.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// Theoretically, <see cref="MaxBytesPerBlock"/> bytes buffer can fit
+        /// exactly one minimal coding unit. In reality, coding blocks occupy much
+        /// less space than the theoretical maximum - this can be exploited.
+        /// If temporal buffer size is multiplied by at least 2, second half of
+        /// the resulting buffer will be used as an overflow 'guard' if next
+        /// block would occupy maximum number of bytes. While first half may fit
+        /// many blocks before needing to flush.
+        /// </para>
+        /// <para>
+        /// This is subject to change. This can be equal to 1 but the recommended
+        /// value is 2 or even greater - further benchmarking is needed.
+        /// </para>
+        /// </remarks>
+        private const int MaxBytesPerBlockMultiplier = 2;

        /// <summary>
-        /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offest, count).
+        /// <see cref="streamWriteBuffer"/> size multiplier.
        /// </summary>
        /// <remarks>
-        /// This is subject to change, 1024 seems to be the best value in terms of performance.
-        /// <see cref="Emit(int, int)"/> expects it to be at least 8 (see comments in method body).
+        /// The Jpeg specification requires inserting 'stuff' bytes after each
+        /// 0xff byte value. The worst case scenario is when all bytes are 0xff.
+        /// While it's highly unlikely (if not impossible) to get such
+        /// combination, it's theoretically possible so buffer size must be guarded.
        /// </remarks>
-        private const int EmitBufferSizeInBytes = 1024;
+        private const int OutputBufferLengthMultiplier = 2;

        /// <summary>
-        /// A buffer for reducing the number of stream writes when emitting Huffman tables.
+        /// Compiled huffman tree to encode given values.
        /// </summary>
-        private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes];
+        /// <remarks>Yields codewords by index consisting of [run length | bitsize].</remarks>
+        private HuffmanLut[] huffmanTables;

        /// <summary>
-        /// Number of filled bytes in <see cref="emitBuffer"/> buffer
+        /// Emitted bits 'micro buffer' before being transferred to the <see cref="emitBuffer"/>.
        /// </summary>
-        private int emitLen = 0;
+        private uint accumulatedBits;

        /// <summary>
-        /// Emmited bits 'micro buffer' before being transfered to the <see cref="emitBuffer"/>.
+        /// Buffer for temporal storage of huffman rle encoding bit data.
        /// </summary>
-        private int accumulatedBits;
+        /// <remarks>
+        /// Encoding bits are assembled to 4 byte unsigned integers and then copied to this buffer.
+        /// This process does NOT include inserting stuff bytes.
+        /// </remarks>
+        private readonly uint[] emitBuffer;
+
+        /// <summary>
+        /// Buffer for temporal storage which is then written to the output stream.
+        /// </summary>
+        /// <remarks>
+        /// Encoding bits from <see cref="emitBuffer"/> are copied to this byte buffer including stuff bytes.
+        /// </remarks>
+        private readonly byte[] streamWriteBuffer;

        /// <summary>
        /// Number of jagged bits stored in <see cref="accumulatedBits"/>
        /// </summary>
        private int bitCount;

-        private Block8x8F temporalBlock1;
-        private Block8x8F temporalBlock2;
+        private int emitWriteIndex;
+
+        private Block8x8 tempBlock;

        /// <summary>
        /// The output stream. All attempted writes after the first error become no-ops.
        /// </summary>
        private readonly Stream target;

-        public HuffmanScanEncoder(Stream outputStream)
+        /// <summary>
+        /// Initializes a new instance of the <see cref="HuffmanScanEncoder"/> class.
+        /// </summary>
+        /// <param name="blocksPerCodingUnit">Amount of encoded 8x8 blocks per single jpeg macroblock.</param>
+        /// <param name="outputStream">Output stream for saving encoded data.</param>
+        public HuffmanScanEncoder(int blocksPerCodingUnit, Stream outputStream)
        {
+            int emitBufferByteLength = MaxBytesPerBlock * blocksPerCodingUnit;
+            this.emitBuffer = new uint[emitBufferByteLength / sizeof(uint)];
+            this.emitWriteIndex = this.emitBuffer.Length;
+
+            this.streamWriteBuffer = new byte[emitBufferByteLength * OutputBufferLengthMultiplier];
+
            this.target = outputStream;
        }

+        /// <summary>
+        /// Gets a value indicating whether <see cref="emitBuffer"/> is full
+        /// and must be flushed using <see cref="FlushToStream()"/>
+        /// before encoding next 8x8 coding block.
+        /// </summary>
+        private bool IsStreamFlushNeeded
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => this.emitWriteIndex < (uint)this.emitBuffer.Length / 2;
+        }
+
        /// <summary>
        /// Encodes the image with no subsampling.
        /// </summary>
        /// <typeparam name="TPixel">The pixel format.</typeparam>
        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
-        /// <param name="luminanceQuantTable">Luminance quantization table provided by the callee</param>
-        /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the callee</param>
+        /// <param name="luminanceQuantTable">Luminance quantization table provided by the caller.</param>
+        /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the caller.</param>
        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
        public void Encode444<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
            where TPixel : unmanaged, IPixel<TPixel>
        {
-            this.huffmanTables = HuffmanLut.TheHuffmanLut;
+            FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable);
+            FastFloatingPointDCT.AdjustToFDCT(ref chrominanceQuantTable);

-            var unzig = ZigZag.CreateUnzigTable();
+            this.huffmanTables = HuffmanLut.TheHuffmanLut;

            // ReSharper disable once InconsistentNaming
            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
@ -100,26 +166,28 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                        QuantIndex.Luminance,
                        prevDCY,
                        ref pixelConverter.Y,
-                        ref luminanceQuantTable,
-                        ref unzig);
+                        ref luminanceQuantTable);

                    prevDCCb = this.WriteBlock(
                        QuantIndex.Chrominance,
                        prevDCCb,
                        ref pixelConverter.Cb,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);

                    prevDCCr = this.WriteBlock(
                        QuantIndex.Chrominance,
                        prevDCCr,
                        ref pixelConverter.Cr,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);
+
+                    if (this.IsStreamFlushNeeded)
+                    {
+                        this.FlushToStream();
+                    }
                }
            }

-            this.FlushInternalBuffer();
+            this.FlushRemainingBytes();
        }

        /// <summary>
@ -128,15 +196,16 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        /// </summary>
        /// <typeparam name="TPixel">The pixel format.</typeparam>
        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
-        /// <param name="luminanceQuantTable">Luminance quantization table provided by the callee</param>
-        /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the callee</param>
+        /// <param name="luminanceQuantTable">Luminance quantization table provided by the caller.</param>
+        /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the caller.</param>
        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
        public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
            where TPixel : unmanaged, IPixel<TPixel>
        {
-            this.huffmanTables = HuffmanLut.TheHuffmanLut;
+            FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable);
+            FastFloatingPointDCT.AdjustToFDCT(ref chrominanceQuantTable);

-            var unzig = ZigZag.CreateUnzigTable();
+            this.huffmanTables = HuffmanLut.TheHuffmanLut;

            // ReSharper disable once InconsistentNaming
            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
@ -161,34 +230,35 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                            QuantIndex.Luminance,
                            prevDCY,
                            ref pixelConverter.YLeft,
-                            ref luminanceQuantTable,
-                            ref unzig);
+                            ref luminanceQuantTable);

                        prevDCY = this.WriteBlock(
                            QuantIndex.Luminance,
                            prevDCY,
                            ref pixelConverter.YRight,
-                            ref luminanceQuantTable,
-                            ref unzig);
+                            ref luminanceQuantTable);
                    }

                    prevDCCb = this.WriteBlock(
                        QuantIndex.Chrominance,
                        prevDCCb,
                        ref pixelConverter.Cb,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);

                    prevDCCr = this.WriteBlock(
                        QuantIndex.Chrominance,
                        prevDCCr,
                        ref pixelConverter.Cr,
-                        ref chrominanceQuantTable,
-                        ref unzig);
+                        ref chrominanceQuantTable);
+
+                    if (this.IsStreamFlushNeeded)
+                    {
+                        this.FlushToStream();
+                    }
                }
            }

-            this.FlushInternalBuffer();
+            this.FlushRemainingBytes();
        }

        /// <summary>
@ -196,14 +266,14 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        /// </summary>
        /// <typeparam name="TPixel">The pixel format.</typeparam>
        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
-        /// <param name="luminanceQuantTable">Luminance quantization table provided by the callee</param>
+        /// <param name="luminanceQuantTable">Luminance quantization table provided by the caller.</param>
        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
        public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
            where TPixel : unmanaged, IPixel<TPixel>
        {
-            this.huffmanTables = HuffmanLut.TheHuffmanLut;
+            FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable);

-            var unzig = ZigZag.CreateUnzigTable();
+            this.huffmanTables = HuffmanLut.TheHuffmanLut;

            // ReSharper disable once InconsistentNaming
            int prevDCY = 0;
@ -226,12 +296,76 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                        QuantIndex.Luminance,
                        prevDCY,
                        ref pixelConverter.Y,
-                        ref luminanceQuantTable,
-                        ref unzig);
+                        ref luminanceQuantTable);
+
+                    if (this.IsStreamFlushNeeded)
+                    {
+                        this.FlushToStream();
+                    }
                }
            }

-            this.FlushInternalBuffer();
+            this.FlushRemainingBytes();
+        }
+
+        /// <summary>
+        /// Encodes the image with no subsampling and keeps the pixel data as Rgb24.
+        /// </summary>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
+        /// <param name="quantTable">Quantization table provided by the caller.</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
+        public void EncodeRgb<TPixel>(Image<TPixel> pixels, ref Block8x8F quantTable, CancellationToken cancellationToken)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            FastFloatingPointDCT.AdjustToFDCT(ref quantTable);
+
+            this.huffmanTables = HuffmanLut.TheHuffmanLut;
+
+            // ReSharper disable once InconsistentNaming
+            int prevDCR = 0, prevDCG = 0, prevDCB = 0;
+
+            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
+            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
+            RowOctet<TPixel> currentRows = default;
+
+            var pixelConverter = new RgbForwardConverter<TPixel>(frame);
+
+            for (int y = 0; y < pixels.Height; y += 8)
+            {
+                cancellationToken.ThrowIfCancellationRequested();
+                currentRows.Update(pixelBuffer, y);
+
+                for (int x = 0; x < pixels.Width; x += 8)
+                {
+                    pixelConverter.Convert(x, y, ref currentRows);
+
+                    prevDCR = this.WriteBlock(
+                        QuantIndex.Luminance,
+                        prevDCR,
+                        ref pixelConverter.R,
+                        ref quantTable);
+
+                    prevDCG = this.WriteBlock(
+                        QuantIndex.Luminance,
+                        prevDCG,
+                        ref pixelConverter.G,
+                        ref quantTable);
+
+                    prevDCB = this.WriteBlock(
+                        QuantIndex.Luminance,
+                        prevDCB,
+                        ref pixelConverter.B,
+                        ref quantTable);
+
+                    if (this.IsStreamFlushNeeded)
+                    {
+                        this.FlushToStream();
+                    }
+                }
+            }
+
+            this.FlushRemainingBytes();
        }

        /// <summary>
@ -241,47 +375,53 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        /// </summary>
        /// <param name="index">The quantization table index.</param>
        /// <param name="prevDC">The previous DC value.</param>
-        /// <param name="src">Source block</param>
-        /// <param name="quant">Quantization table</param>
-        /// <param name="unZig">The 8x8 Unzig block.</param>
+        /// <param name="block">Source block.</param>
+        /// <param name="quant">Quantization table.</param>
        /// <returns>The <see cref="int"/>.</returns>
        private int WriteBlock(
            QuantIndex index,
            int prevDC,
-            ref Block8x8F src,
-            ref Block8x8F quant,
-            ref ZigZag unZig)
+            ref Block8x8F block,
+            ref Block8x8F quant)
        {
-            ref Block8x8F refTemp1 = ref this.temporalBlock1;
-            ref Block8x8F refTemp2 = ref this.temporalBlock2;
+            ref Block8x8 spectralBlock = ref this.tempBlock;

-            FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2);
+            // Shifting level from 0..255 to -128..127
+            block.AddInPlace(-128f);

-            Block8x8F.Quantize(ref refTemp1, ref refTemp2, ref quant, ref unZig);
+            // Discrete cosine transform
+            FastFloatingPointDCT.TransformFDCT(ref block);
+
+            // Quantization
+            Block8x8F.Quantize(ref block, ref spectralBlock, ref quant);

            // Emit the DC delta.
-            int dc = (int)refTemp2[0];
-            this.EmitDirectCurrentTerm(this.huffmanTables[2 * (int)index].Values, dc - prevDC);
+            int dc = spectralBlock[0];
+            this.EmitHuffRLE(this.huffmanTables[2 * (int)index].Values, 0, dc - prevDC);

            // Emit the AC components.
            int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values;

+            nint lastValuableIndex = spectralBlock.GetLastNonZeroIndex();
+
            int runLength = 0;
-            int lastValuableIndex = GetLastValuableElementIndex(ref refTemp2);
-            for (int zig = 1; zig <= lastValuableIndex; zig++)
+            ref short blockRef = ref Unsafe.As<Block8x8, short>(ref spectralBlock);
+            for (nint zig = 1; zig <= lastValuableIndex; zig++)
            {
-                int ac = (int)refTemp2[zig];
+                const int zeroRun1 = 1 << 4;
+                const int zeroRun16 = 16 << 4;

+                int ac = Unsafe.Add(ref blockRef, zig);
                if (ac == 0)
                {
-                    runLength++;
+                    runLength += zeroRun1;
                }
                else
                {
-                    while (runLength > 15)
+                    while (runLength >= zeroRun16)
                    {
                        this.EmitHuff(acHuffTable, 0xf0);
-                        runLength -= 16;
+                        runLength -= zeroRun16;
                    }

                    this.EmitHuffRLE(acHuffTable, runLength, ac);
@ -301,100 +441,89 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        }

        /// <summary>
-        /// Emits the least significant count of bits to the stream write buffer.
-        /// The precondition is bits
-        /// <example>
-        /// &lt; 1&lt;&lt;nBits &amp;&amp; nBits &lt;= 16
-        /// </example>
-        /// .
+        /// Emits the most significant count of bits to the buffer.
        /// </summary>
-        /// <param name="bits">The packed bits.</param>
-        /// <param name="count">The number of bits</param>
+        /// <remarks>
+        /// <para>
+        /// Supports up to 32 count of bits but, generally speaking, jpeg
+        /// standard assures that there won't be more than 16 bits per single
+        /// value.
+        /// </para>
+        /// <para>
+        /// Emitting algorithm uses 3 intermediate buffers for caching before
+        /// writing to the stream:
+        /// <list type="number">
+        /// <item>
+        /// <term>uint32</term>
+        /// <description>
+        /// Bit buffer. Encoded spectral values can occupy up to 16 bits, bits
+        /// are assembled to whole bytes via this intermediate buffer.
+        /// </description>
+        /// </item>
+        /// <item>
+        /// <term>uint32[]</term>
+        /// <description>
+        /// Assembled bytes from uint32 buffer are saved into this buffer.
+        /// uint32 buffer values are saved using indices from the last to the first.
+        /// As bytes are saved to the memory as 4-byte packages endianness matters:
+        /// Jpeg stream is big-endian, indexing buffer bytes from the last index to the
+        /// first eliminates all operations to extract separate bytes. This only works for
+        /// little-endian machines (there are no known examples of big-endian users atm).
+        /// For big-endians this approach is slower due to the separate byte extraction.
+        /// </description>
+        /// </item>
+        /// <item>
+        /// <term>byte[]</term>
+        /// <description>
+        /// Byte buffer used only during <see cref="FlushToStream(int)"/> method.
+        /// </description>
+        /// </item>
+        /// </list>
+        /// </para>
+        /// </remarks>
+        /// <param name="bits">Bits to emit, must be shifted to the left.</param>
+        /// <param name="count">Bits count stored in the bits parameter.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
-        private void Emit(int bits, int count)
+        private void Emit(uint bits, int count)
        {
+            this.accumulatedBits |= bits >> this.bitCount;
+
            count += this.bitCount;
-            bits <<= 32 - count;
-            bits |= this.accumulatedBits;

-            // Only write if more than 8 bits.
-            if (count >= 8)
+            if (count >= 32)
            {
-                // Track length
-                while (count >= 8)
-                {
-                    byte b = (byte)(bits >> 24);
-                    this.emitBuffer[this.emitLen++] = b;
-
-                    // Adding stuff byte
-                    // This is because by JPEG standard scan data can contain JPEG markers (indicated by the 0xFF byte, followed by a non-zero byte)
-                    // Considering this every 0xFF byte must be followed by 0x00 padding byte to signal that this is not a marker
-                    if (b == byte.MaxValue)
-                    {
-                        this.emitBuffer[this.emitLen++] = byte.MinValue;
-                    }
-
-                    bits <<= 8;
-                    count -= 8;
-                }
+                this.emitBuffer[--this.emitWriteIndex] = this.accumulatedBits;
+                this.accumulatedBits = bits << (32 - this.bitCount);

-                // This can emit 4 times of:
-                // 1 byte guaranteed
-                // 1 extra byte.MinValue byte if previous one was byte.MaxValue
-                // Thus writing (1 + 1) * 4 = 8 bytes max
-                // So we must check if emit buffer has extra 8 bytes, if not - call stream.Write
-                if (this.emitLen > EmitBufferSizeInBytes - 8)
-                {
-                    this.target.Write(this.emitBuffer, 0, this.emitLen);
-                    this.emitLen = 0;
-                }
+                count -= 32;
            }

-            this.accumulatedBits = bits;
            this.bitCount = count;
        }

        /// <summary>
-        /// Emits the given value with the given Huffman encoder.
+        /// Emits the given value with the given Huffman table.
        /// </summary>
-        /// <param name="table">Compiled Huffman spec values.</param>
-        /// <param name="value">The value to encode.</param>
+        /// <param name="table">Huffman table.</param>
+        /// <param name="value">Value to encode.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        private void EmitHuff(int[] table, int value)
        {
            int x = table[value];
-            this.Emit(x >> 8, x & 0xff);
-        }
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitDirectCurrentTerm(int[] table, int value)
-        {
-            int a = value;
-            int b = value;
-            if (a < 0)
-            {
-                a = -value;
-                b = value - 1;
-            }
-
-            int bt = GetHuffmanEncodingLength((uint)a);
-
-            this.EmitHuff(table, bt);
-            if (bt > 0)
-            {
-                this.Emit(b & ((1 << bt) - 1), bt);
-            }
+            this.Emit((uint)x & 0xffff_ff00u, x & 0xff);
        }

        /// <summary>
-        /// Emits a run of runLength copies of value encoded with the given Huffman encoder.
+        /// Emits given value via huffman rle encoding.
        /// </summary>
-        /// <param name="table">Compiled Huffman spec values.</param>
-        /// <param name="runLength">The number of copies to encode.</param>
-        /// <param name="value">The value to encode.</param>
+        /// <param name="table">Huffman table.</param>
+        /// <param name="runLength">The number of preceding zeroes, preshifted by 4 to the left.</param>
+        /// <param name="value">Value to encode.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        private void EmitHuffRLE(int[] table, int runLength, int value)
        {
+            DebugGuard.IsTrue((runLength & 0xf) == 0, $"{nameof(runLength)} parameter must be shifted to the left by 4 bits");
+
            int a = value;
            int b = value;
            if (a < 0)
@ -403,25 +532,18 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
                b = value - 1;
            }

-            int bt = GetHuffmanEncodingLength((uint)a);
+            int valueLen = GetHuffmanEncodingLength((uint)a);

-            this.EmitHuff(table, (runLength << 4) | bt);
-            this.Emit(b & ((1 << bt) - 1), bt);
-        }
+            // Huffman prefix code
+            int huffPackage = table[runLength | valueLen];
+            int prefixLen = huffPackage & 0xff;
+            uint prefix = (uint)huffPackage & 0xffff_0000u;

-        /// <summary>
-        /// Writes remaining bytes from internal buffer to the target stream.
-        /// </summary>
-        /// <remarks>Pads last byte with 1's if necessary</remarks>
-        private void FlushInternalBuffer()
-        {
-            // pad last byte with 1's
-            int padBitsCount = 8 - (this.bitCount % 8);
-            if (padBitsCount != 0)
-            {
-                this.Emit((1 << padBitsCount) - 1, padBitsCount);
-                this.target.Write(this.emitBuffer, 0, this.emitLen);
-            }
+            // Actual encoded value
+            uint encodedValue = (uint)b << (32 - valueLen);
+
+            // Doing two binary shifts to get rid of leading 1's in negative value case
+            this.Emit(prefix | (encodedValue >> prefixLen), prefixLen + valueLen);
        }

        /// <summary>
@ -437,19 +559,19 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
            DebugGuard.IsTrue(value <= (1 << 16), "Huffman encoder is supposed to encode a value of 16bit size max");
 #if SUPPORTS_BITOPERATIONS
            // This should have been implemented as (BitOperations.Log2(value) + 1) as in non-intrinsic implementation
-            // But internal log2 is implementated like this: (31 - (int)Lzcnt.LeadingZeroCount(value))
+            // But internal log2 is implemented like this: (31 - (int)Lzcnt.LeadingZeroCount(value))

            // BitOperations.Log2 implementation also checks if input value is zero for the convention 0->0
            // Lzcnt would return 32 for input value of 0 - no need to check that with branching
            // Fallback code if Lzcnt is not supported still use if-check
            // But most modern CPUs support this instruction so this should not be a problem
-            return 32 - System.Numerics.BitOperations.LeadingZeroCount(value);
+            return 32 - BitOperations.LeadingZeroCount(value);
 #else
            // Ideally:
            // if 0 - return 0 in this case
            // else - return log2(value) + 1
            //
-            // Hack based on input value constaint:
+            // Hack based on input value constraint:
            // We know that input values are guaranteed to be maximum 16 bit large for huffman encoding
            // We can safely shift input value for one bit -> log2(value << 1)
            // Because of the 16 bit value constraint it won't overflow
@ -460,65 +582,108 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        }

        /// <summary>
-        /// Returns index of the last non-zero element in given mcu block.
-        /// If all values of the mcu block are zero, this method might return different results depending on the runtime and hardware support.
-        /// This is jpeg mcu specific code, mcu[0] stores a dc value which will be encoded outside of the loop.
-        /// This method is guaranteed to return either -1 or 0 if all elements are zero.
+        /// General method for flushing cached spectral data bytes to
+        /// the output stream respecting stuff bytes.
        /// </summary>
        /// <remarks>
-        /// This is an internal operation supposed to be used only in <see cref="HuffmanScanEncoder"/> class for jpeg encoding.
+        /// Bytes cached via <see cref="Emit"/> are stored in 4-bytes blocks
+        /// which makes this method endianness dependent.
        /// </remarks>
-        /// <param name="mcu">Mcu block.</param>
-        /// <returns>Index of the last non-zero element.</returns>
        [MethodImpl(InliningOptions.ShortMethod)]
-        internal static int GetLastValuableElementIndex(ref Block8x8F mcu)
+        private void FlushToStream(int endIndex)
        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx2.IsSupported)
-            {
-                const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
+            Span<byte> emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan());

-                Vector256<int> zero8 = Vector256<int>.Zero;
+            int writeIdx = 0;
+            int startIndex = emitBytes.Length - 1;

-                ref Vector256<float> mcuStride = ref mcu.V0;
-
-                for (int i = 7; i >= 0; i--)
+            // Some platforms may fail to eliminate this if-else branching
+            // Even if it happens - buffer is flushed in big packs,
+            // branching overhead shouldn't be noticeable
+            if (BitConverter.IsLittleEndian)
+            {
+                // For little endian case bytes are ordered and can be
+                // safely written to the stream with stuff bytes
+                // First byte is cached on the most significant index
+                // so we are going from the end of the array to its beginning:
+                // ... [  double word #1   ] [  double word #0   ]
+                // ... [idx3|idx2|idx1|idx0] [idx3|idx2|idx1|idx0]
+                for (int i = startIndex; i >= endIndex; i--)
                {
-                    int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32(Unsafe.Add(ref mcuStride, i)), zero8).AsByte());
+                    byte value = emitBytes[i];
+                    this.streamWriteBuffer[writeIdx++] = value;

-                    // we do not know for sure if this stride contain all non-zero elements or if it has some trailing zeros
-                    if (areEqual != equalityMask)
+                    // Inserting stuff byte
+                    if (value == 0xff)
                    {
-                        // last index in the stride, we go from the end to the start of the stride
-                        int startIndex = i * 8;
-                        int index = startIndex + 7;
-                        ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref mcu);
-                        while (index >= startIndex && (int)Unsafe.Add(ref elemRef, index) == 0)
-                        {
-                            index--;
-                        }
-
-                        // this implementation will return -1 if all ac components are zero and dc are zero
-                        return index;
+                        this.streamWriteBuffer[writeIdx++] = 0x00;
                    }
                }
-
-                return -1;
            }
            else
-#endif
            {
-                int index = Block8x8F.Size - 1;
-                ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref mcu);
-
-                while (index > 0 && (int)Unsafe.Add(ref elemRef, index) == 0)
+                // For big endian case bytes are ordered in 4-byte packs
+                // which are ordered like bytes in the little endian case but in 4-byte packs:
+                // ... [  double word #1   ] [  double word #0   ]
+                // ... [idx0|idx1|idx2|idx3] [idx0|idx1|idx2|idx3]
+                // So we must write each 4-bytes in 'natural order'
+                for (int i = startIndex; i >= endIndex; i -= 4)
                {
-                    index--;
-                }
+                    // This loop is caused by the nature of underlying byte buffer
+                    // implementation and indeed degrades performance by roughly 5%
+                    // compared to little endian scenario
+                    // Even with this performance drop this cached buffer implementation
+                    // is faster than individually writing bytes using binary shifts and binary and(s)
+                    for (int j = i - 3; j <= i; j++)
+                    {
+                        byte value = emitBytes[j];
+                        this.streamWriteBuffer[writeIdx++] = value;

-                // this implementation will return 0 if all ac components and dc are zero
-                return index;
+                        // Inserting stuff byte
+                        if (value == 0xff)
+                        {
+                            this.streamWriteBuffer[writeIdx++] = 0x00;
+                        }
+                    }
+                }
            }
+
+            this.target.Write(this.streamWriteBuffer, 0, writeIdx);
+        }
+
+        /// <summary>
+        /// Flushes spectral data bytes after encoding all channel blocks
+        /// in a single jpeg macroblock using <see cref="WriteBlock"/>.
+        /// </summary>
+        /// <remarks>
+        /// This must be called only when <see cref="IsStreamFlushNeeded"/> is true,
+        /// and only during the macroblock encoding routine.
+        /// </remarks>
+        private void FlushToStream()
+        {
+            this.FlushToStream(this.emitWriteIndex * 4);
+            this.emitWriteIndex = this.emitBuffer.Length;
+        }
+
+        /// <summary>
+        /// Flushes final cached bits to the stream padding 1's to
+        /// complement full bytes.
+        /// </summary>
+        /// <remarks>
+        /// This must be called only once at the end of the encoding routine.
+        /// <see cref="IsStreamFlushNeeded"/> check is not needed.
+        /// </remarks>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void FlushRemainingBytes()
+        {
+            // Pad the unused low bits of the last word with 1's without touching the bits already
+            // accumulated; only the bytes that actually carry bits are written to the output stream.
+            int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8);
+            uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount);
+            this.emitBuffer[--this.emitWriteIndex] = packedBytes;
+
+            // FlushToStream walks bytes from the highest index down, so stop at (last word start + 4 - valuable bytes)
+            this.FlushToStream((this.emitWriteIndex * 4) + 4 - valuableBytesCount);
+        }
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/QuantIndex.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/QuantIndex.cs
@ -1,21 +1,21 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
    /// <summary>
-    ///     Enumerates the quantization tables
+    /// Enumerates the quantization tables.
    /// </summary>
    internal enum QuantIndex
    {
        /// <summary>
-        ///     The luminance quantization table index
+        /// The luminance quantization table index.
        /// </summary>
        Luminance = 0,

        /// <summary>
-        ///     The chrominance quantization table index
+        /// The chrominance quantization table index.
        /// </summary>
        Chrominance = 1,
    }
-}
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbForwardConverter{TPixel}.cs
@ -0,0 +1,114 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+    /// <summary>
+    /// On-stack worker struct to convert TPixel -> Rgb24 of 8x8 pixel blocks.
+    /// </summary>
+    /// <typeparam name="TPixel">The pixel type to work on.</typeparam>
+    internal ref struct RgbForwardConverter<TPixel>
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        /// <summary>
+        /// Number of pixels processed per single <see cref="Convert(int, int, ref RowOctet{TPixel})"/> call
+        /// </summary>
+        private const int PixelsPerSample = 8 * 8;
+
+        /// <summary>
+        /// Total byte size of processed pixels converted from TPixel to <see cref="Rgb24"/>
+        /// </summary>
+        private const int RgbSpanByteSize = PixelsPerSample * 3;
+
+        /// <summary>
+        /// <see cref="Size"/> of sampling area from given frame pixel buffer.
+        /// </summary>
+        private static readonly Size SampleSize = new Size(8, 8);
+
+        /// <summary>
+        /// The Red component.
+        /// </summary>
+        public Block8x8F R;
+
+        /// <summary>
+        /// The Green component.
+        /// </summary>
+        public Block8x8F G;
+
+        /// <summary>
+        /// The Blue component.
+        /// </summary>
+        public Block8x8F B;
+
+        /// <summary>
+        /// Temporary 64-element span to hold unconverted TPixel data.
+        /// </summary>
+        private readonly Span<TPixel> pixelSpan;
+
+        /// <summary>
+        /// Temporary span to hold converted Rgb24 data (64 pixels plus padding).
+        /// </summary>
+        private readonly Span<Rgb24> rgbSpan;
+
+        /// <summary>
+        /// Sampled pixel buffer size.
+        /// </summary>
+        private readonly Size samplingAreaSize;
+
+        /// <summary>
+        /// <see cref="Configuration"/> for internal operations.
+        /// </summary>
+        private readonly Configuration config;
+
+        public RgbForwardConverter(ImageFrame<TPixel> frame)
+        {
+            this.R = default;
+            this.G = default;
+            this.B = default;
+
+            // temporal pixel buffers
+            this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
+            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan());
+
+            // frame data
+            this.samplingAreaSize = new Size(frame.Width, frame.Height);
+            this.config = frame.GetConfiguration();
+        }
+
+        /// <summary>
+        /// Converts a 8x8 image area inside 'pixels' at position (x, y) to Rgb24.
+        /// </summary>
+        public void Convert(int x, int y, ref RowOctet<TPixel> currentRows)
+        {
+            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize);
+
+            PixelOperations<TPixel>.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan);
+
+            ref Block8x8F redBlock = ref this.R;
+            ref Block8x8F greenBlock = ref this.G;
+            ref Block8x8F blueBlock = ref this.B;
+
+            CopyToBlock(this.rgbSpan, ref redBlock, ref greenBlock, ref blueBlock);
+        }
+
+        private static void CopyToBlock(Span<Rgb24> rgbSpan, ref Block8x8F redBlock, ref Block8x8F greenBlock, ref Block8x8F blueBlock)
+        {
+            ref Rgb24 rgbStart = ref MemoryMarshal.GetReference(rgbSpan);
+
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                Rgb24 c = Unsafe.Add(ref rgbStart, (nint)(uint)i);
+
+                redBlock[i] = c.R;
+                greenBlock[i] = c.G;
+                blueBlock[i] = c.B;
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@ -58,22 +58,22 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        /// <summary>
        /// Temporal 16x8 block to hold TPixel data
        /// </summary>
-        private Span<TPixel> pixelSpan;
+        private readonly Span<TPixel> pixelSpan;

        /// <summary>
        /// Temporal RGB block
        /// </summary>
-        private Span<Rgb24> rgbSpan;
+        private readonly Span<Rgb24> rgbSpan;

        /// <summary>
        /// Sampled pixel buffer size
        /// </summary>
-        private Size samplingAreaSize;
+        private readonly Size samplingAreaSize;

        /// <summary>
        /// <see cref="Configuration"/> for internal operations
        /// </summary>
-        private Configuration config;
+        private readonly Configuration config;

        public YCbCrForwardConverter420(ImageFrame<TPixel> frame)
        {
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@ -53,22 +53,22 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
        /// <summary>
        /// Temporal 64-byte span to hold unconverted TPixel data
        /// </summary>
-        private Span<TPixel> pixelSpan;
+        private readonly Span<TPixel> pixelSpan;

        /// <summary>
        /// Temporal 64-byte span to hold converted Rgb24 data
        /// </summary>
-        private Span<Rgb24> rgbSpan;
+        private readonly Span<Rgb24> rgbSpan;

        /// <summary>
        /// Sampled pixel buffer size
        /// </summary>
-        private Size samplingAreaSize;
+        private readonly Size samplingAreaSize;

        /// <summary>
        /// <see cref="Configuration"/> for internal operations
        /// </summary>
-        private Configuration config;
+        private readonly Configuration config;

        public YCbCrForwardConverter444(ImageFrame<TPixel> frame)
        {