Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,32 +8,36 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
internal abstract partial class JpegColorConverter
{
internal sealed class FromCmyk : JpegColorConverter
internal sealed class FromCmykBasic : JpegColorConverter
{
public FromCmyk(int precision)
public FromCmykBasic(int precision)
: base(JpegColorSpace.Cmyk, precision)
{
}

public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
{
// TODO: We can optimize a lot here with Vector<float> and SRCS.Unsafe()!
ConvertCore(values, result, this.MaximumValue);
}

internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
{
ReadOnlySpan<float> cVals = values.Component0;
ReadOnlySpan<float> mVals = values.Component1;
ReadOnlySpan<float> yVals = values.Component2;
ReadOnlySpan<float> kVals = values.Component3;

var v = new Vector4(0, 0, 0, 1F);

var maximum = 1 / this.MaximumValue;
var maximum = 1 / maxValue;
var scale = new Vector4(maximum, maximum, maximum, 1F);

for (int i = 0; i < result.Length; i++)
{
float c = cVals[i];
float m = mVals[i];
float y = yVals[i];
float k = kVals[i] / this.MaximumValue;
float k = kVals[i] / maxValue;

v.X = c * k;
v.Y = m * k;
Expand All @@ -47,4 +51,4 @@ public override void ConvertToRgba(in ComponentValues values, Span<Vector4> resu
}
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using static SixLabors.ImageSharp.SimdUtils;
#else
using SixLabors.ImageSharp.Tuples;
#endif

namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
internal abstract partial class JpegColorConverter
{
internal sealed class FromCmykVector8 : JpegColorConverter
{
public FromCmykVector8(int precision)
: base(JpegColorSpace.Cmyk, precision)
{
}

public static bool IsAvailable => Vector.IsHardwareAccelerated && SimdUtils.HasVector8;

public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
{
int remainder = result.Length % 8;
int simdCount = result.Length - remainder;
if (simdCount > 0)
{
ConvertCore(values.Slice(0, simdCount), result.Slice(0, simdCount), this.MaximumValue);
}

FromCmykBasic.ConvertCore(values.Slice(simdCount, remainder), result.Slice(simdCount, remainder), this.MaximumValue);
}

internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
{
// This implementation is actually AVX specific.
// An AVX register is capable of storing 8 float-s.
if (!IsAvailable)
{
throw new InvalidOperationException(
"JpegColorConverter.FromGrayscaleVector8 can be used only on architecture having 256 byte floating point SIMD registers!");
}

#if SUPPORTS_RUNTIME_INTRINSICS
ref Vector256<float> cBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector256<float> mBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector256<float> yBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector256<float> kBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));

ref Vector256<float> resultBase =
ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

// Used for the color conversion
var scale = Vector256.Create(1 / maxValue);
var one = Vector256.Create(1F);

// Used for packing
ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

int n = result.Length / 8;
for (int i = 0; i < n; i++)
{
Vector256<float> k = Avx2.PermuteVar8x32(Unsafe.Add(ref kBase, i), vcontrol);
Vector256<float> c = Avx2.PermuteVar8x32(Unsafe.Add(ref cBase, i), vcontrol);
Vector256<float> m = Avx2.PermuteVar8x32(Unsafe.Add(ref mBase, i), vcontrol);
Vector256<float> y = Avx2.PermuteVar8x32(Unsafe.Add(ref yBase, i), vcontrol);

k = Avx.Multiply(k, scale);

c = Avx.Multiply(Avx.Multiply(c, k), scale);
m = Avx.Multiply(Avx.Multiply(m, k), scale);
y = Avx.Multiply(Avx.Multiply(y, k), scale);

Vector256<float> cmLo = Avx.UnpackLow(c, m);
Vector256<float> yoLo = Avx.UnpackLow(y, one);
Vector256<float> cmHi = Avx.UnpackHigh(c, m);
Vector256<float> yoHi = Avx.UnpackHigh(y, one);

ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);

destination = Avx.Shuffle(cmLo, yoLo, 0b01_00_01_00);
Unsafe.Add(ref destination, 1) = Avx.Shuffle(cmLo, yoLo, 0b11_10_11_10);
Unsafe.Add(ref destination, 2) = Avx.Shuffle(cmHi, yoHi, 0b01_00_01_00);
Unsafe.Add(ref destination, 3) = Avx.Shuffle(cmHi, yoHi, 0b11_10_11_10);
}
#else
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't have chance to react, but I'm not entirely happy with the practice established in #1402. I would try putting the different paths into different classes (FromCmykVector8 VS FromCmykVectorAvx2).
FromCmykVectorAvx2 can derive from FromCmykVector8 for convenience.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that was a stopgap. I’d like to revisit

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't shy away from inheritance here, I would suggest a hierarchy like this:

abstract class JpegColorConverter
{
    public abstract bool IsAvailable { get; } // new on this class
}

abstract class BasicJpegColorConverter : JpegColorConverter // new base class for all non-vectorized color converters
{
    public override bool IsAvailable => true;
}

abstract class VectorizedJpegColorConverter : JpegColorConverter // new base class for all vectorized color converters
{
    private readonly int vectorSize;

    protected VectorizedJpegColorConverter(JpegColorSpace colorSpace, int precision, int vectorSize)
        : base(colorSpace, precision)
    {
        this.vectorSize = vectorSize;
    }

    public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
    {
        int remainder = result.Length % vectorSize;
        int simdCount = result.Length - remainder;
        if (simdCount > 0)
        {
            ConvertCoreVectorized(values.Slice(0, simdCount), result.Slice(0, simdCount));
        }

        ConvertCore(values.Slice(simdCount, remainder), result.Slice(simdCount, remainder));
    }

    protected abstract void ConvertCoreVectorized(in ComponentValues values, Span<Vector4> result);

    protected abstract void ConvertCore(in ComponentValues values, Span<Vector4> result);
}

abstract class Avx2JpegColorConverter : VectorizedJpegColorConverter // new base class for all AVX2-based converters
{
    protected Avx2JpegColorConverter(JpegColorSpace colorSpace, int precision)
        : base(colorSpace, precision, 8)
    {
    }

    public override bool IsAvailable
    {
        get
        {
#if SUPPORTS_RUNTIME_INTRINSICS
            return Avx2.IsSupported;
#else
            return false;
#endif
        }
    }
}

// another base class for Vector<float>-based converters

We could then instantiate all converters initially into a static array and determine which one to use based on ColorSpace, Precision and IsAvailable. This would address your point and maximize code re-use at the cost of some virtual method calls.

Let me know what you think. I didn't want to refactor to much for the initial draft-PR and just add the vectorized logic into the existing code structure.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tkp1n I like the concept, go for it! Hopefully the runtime costs of the virtual dispatches are be still negligible.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I refactored as discussed above, PTAL...

Next up: Benchmarks 🚀

ref Vector<float> cBase =
ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));
ref Vector<float> mBase =
ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component1));
ref Vector<float> yBase =
ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component2));
ref Vector<float> kBase =
ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component3));

ref Vector4Octet resultBase =
ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));

Vector4Pair cc = default;
Vector4Pair mm = default;
Vector4Pair yy = default;
ref Vector<float> ccRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref cc);
ref Vector<float> mmRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref mm);
ref Vector<float> yyRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref yy);

var scale = new Vector<float>(1 / maxValue);

// Walking 8 elements at one step:
int n = result.Length / 8;
for (int i = 0; i < n; i++)
{
Vector<float> c = Unsafe.Add(ref cBase, i);
Vector<float> m = Unsafe.Add(ref mBase, i);
Vector<float> y = Unsafe.Add(ref yBase, i);
Vector<float> k = Unsafe.Add(ref kBase, i) * scale;

c = (c * k) * scale;
m = (m * k) * scale;
y = (y * k) * scale;

ccRefAsVector = c;
mmRefAsVector = m;
yyRefAsVector = y;

// Collect (c0,c1...c8) (m0,m1...m8) (y0,y1...y8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order:
ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
destination.Pack(ref cc, ref mm, ref yy);
}
#endif
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,21 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
internal abstract partial class JpegColorConverter
{
internal sealed class FromGrayscale : JpegColorConverter
internal sealed class FromGrayscaleBasic : JpegColorConverter
{
public FromGrayscale(int precision)
public FromGrayscaleBasic(int precision)
: base(JpegColorSpace.Grayscale, precision)
{
}

public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
{
var maximum = 1 / this.MaximumValue;
ConvertCore(values, result, this.MaximumValue);
}

internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
{
var maximum = 1 / maxValue;
var scale = new Vector4(maximum, maximum, maximum, 1F);

ref float sBase = ref MemoryMarshal.GetReference(values.Component0);
Expand All @@ -35,4 +40,4 @@ public override void ConvertToRgba(in ComponentValues values, Span<Vector4> resu
}
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using static SixLabors.ImageSharp.SimdUtils;
#else
using SixLabors.ImageSharp.Tuples;
#endif

namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
internal abstract partial class JpegColorConverter
{
internal sealed class FromGrayscaleVector8 : JpegColorConverter
{
public FromGrayscaleVector8(int precision)
: base(JpegColorSpace.Grayscale, precision)
{
}

public static bool IsAvailable => Vector.IsHardwareAccelerated && SimdUtils.HasVector8;

public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
{
int remainder = result.Length % 8;
int simdCount = result.Length - remainder;
if (simdCount > 0)
{
ConvertCore(values.Slice(0, simdCount), result.Slice(0, simdCount), this.MaximumValue);
}

FromGrayscaleBasic.ConvertCore(values.Slice(simdCount, remainder), result.Slice(simdCount, remainder), this.MaximumValue);
}

internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
{
// This implementation is actually AVX specific.
// An AVX register is capable of storing 8 float-s.
if (!IsAvailable)
{
throw new InvalidOperationException(
"JpegColorConverter.FromGrayscaleVector8 can be used only on architecture having 256 byte floating point SIMD registers!");
}

#if SUPPORTS_RUNTIME_INTRINSICS
ref Vector256<float> gBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));

ref Vector256<float> resultBase =
ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(result));

// Used for the color conversion
var scale = Vector256.Create(1 / maxValue);
var one = Vector256.Create(1F);

// Used for packing
ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

int n = result.Length / 8;
for (int i = 0; i < n; i++)
{
Vector256<float> g = Avx2.PermuteVar8x32(Unsafe.Add(ref gBase, i), vcontrol);

g = Avx.Multiply(g, scale);

ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);

destination = Avx.Blend(Avx.Permute(g, 0b00_00_00_00), one, 0b1000_1000);
Unsafe.Add(ref destination, 1) = Avx.Blend(Avx.Permute(g, 0b01_01_01_01), one, 0b1000_1000);
Unsafe.Add(ref destination, 2) = Avx.Blend(Avx.Permute(g, 0b10_10_10_10), one, 0b1000_1000);
Unsafe.Add(ref destination, 3) = Avx.Blend(Avx.Permute(g, 0b11_11_11_11), one, 0b1000_1000);
}
#else
ref Vector<float> gBase =
ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(values.Component0));

ref Vector4Octet resultBase =
ref Unsafe.As<Vector4, Vector4Octet>(ref MemoryMarshal.GetReference(result));

Vector4Pair gg = default;
ref Vector<float> ggRefAsVector = ref Unsafe.As<Vector4Pair, Vector<float>>(ref gg);

var scale = new Vector<float>(1 / maxValue);

// Walking 8 elements at one step:
int n = result.Length / 8;
for (int i = 0; i < n; i++)
{
Vector<float> g = Unsafe.Add(ref gBase, i);
g *= scale;

ggRefAsVector = g;

// Collect (g0,g1...g7) vector values in the expected (g0,g0,g0,1), (g1,g1,g1,1) ... order:
ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
destination.Pack(ref gg);
}
#endif
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,27 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
{
internal abstract partial class JpegColorConverter
{
internal sealed class FromRgb : JpegColorConverter
internal sealed class FromRgbBasic : JpegColorConverter
{
public FromRgb(int precision)
public FromRgbBasic(int precision)
: base(JpegColorSpace.RGB, precision)
{
}

public override void ConvertToRgba(in ComponentValues values, Span<Vector4> result)
{
// TODO: We can optimize a lot here with Vector<float> and SRCS.Unsafe()!
ConvertCore(values, result, this.MaximumValue);
}

internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue)
{
ReadOnlySpan<float> rVals = values.Component0;
ReadOnlySpan<float> gVals = values.Component1;
ReadOnlySpan<float> bVals = values.Component2;

var v = new Vector4(0, 0, 0, 1);

var maximum = 1 / this.MaximumValue;
var maximum = 1 / maxValue;
var scale = new Vector4(maximum, maximum, maximum, 1F);

for (int i = 0; i < result.Length; i++)
Expand All @@ -44,4 +48,4 @@ public override void ConvertToRgba(in ComponentValues values, Span<Vector4> resu
}
}
}
}
}
Loading