Merged
Conversation
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.

diff --git a/lib/intrinsics/ext/SPIRVIntrinsicsSIMDExt.jl b/lib/intrinsics/ext/SPIRVIntrinsicsSIMDExt.jl
index 81daf1b..9b62da4 100644
--- a/lib/intrinsics/ext/SPIRVIntrinsicsSIMDExt.jl
+++ b/lib/intrinsics/ext/SPIRVIntrinsicsSIMDExt.jl
@@ -9,8 +9,8 @@ const known_intrinsics = String[]
# Generate vectorized math intrinsics
for N in [2, 3, 4, 8, 16], T in [Float16, Float32, Float64]
- VT = :(Vec{$N,$T})
- LVT = :(SIMD.LVec{$N,$T})
+ VT = :(Vec{$N, $T})
+ LVT = :(SIMD.LVec{$N, $T})
@eval begin
# Unary operations
@@ -98,8 +98,8 @@ for N in [2, 3, 4, 8, 16], T in [Float16, Float32, Float64]
end
# Special operations with Int32 parameters
- VIntT = :(Vec{$N,Int32})
- LVIntT = :(SIMD.LVec{$N,Int32})
+ VIntT = :(Vec{$N, Int32})
+ LVIntT = :(SIMD.LVec{$N, Int32})
@eval begin
@device_function @inline SPIRVIntrinsics.ilogb(x::$VT) = $VIntT(@builtin_ccall("ilogb", $LVIntT, ($LVT,), x.data))
@@ -112,9 +112,9 @@ end
# nan functions - take unsigned integer codes and return floats
for N in [2, 3, 4, 8, 16]
@eval begin
- @device_function @inline SPIRVIntrinsics.nan(nancode::Vec{$N,UInt16}) = Vec{$N,Float16}(@builtin_ccall("nan", SIMD.LVec{$N,Float16}, (SIMD.LVec{$N,UInt16},), nancode.data))
- @device_function @inline SPIRVIntrinsics.nan(nancode::Vec{$N,UInt32}) = Vec{$N,Float32}(@builtin_ccall("nan", SIMD.LVec{$N,Float32}, (SIMD.LVec{$N,UInt32},), nancode.data))
- @device_function @inline SPIRVIntrinsics.nan(nancode::Vec{$N,UInt64}) = Vec{$N,Float64}(@builtin_ccall("nan", SIMD.LVec{$N,Float64}, (SIMD.LVec{$N,UInt64},), nancode.data))
+ @device_function @inline SPIRVIntrinsics.nan(nancode::Vec{$N, UInt16}) = Vec{$N, Float16}(@builtin_ccall("nan", SIMD.LVec{$N, Float16}, (SIMD.LVec{$N, UInt16},), nancode.data))
+ @device_function @inline SPIRVIntrinsics.nan(nancode::Vec{$N, UInt32}) = Vec{$N, Float32}(@builtin_ccall("nan", SIMD.LVec{$N, Float32}, (SIMD.LVec{$N, UInt32},), nancode.data))
+ @device_function @inline SPIRVIntrinsics.nan(nancode::Vec{$N, UInt64}) = Vec{$N, Float64}(@builtin_ccall("nan", SIMD.LVec{$N, Float64}, (SIMD.LVec{$N, UInt64},), nancode.data))
end
end
diff --git a/lib/intrinsics/src/utils.jl b/lib/intrinsics/src/utils.jl
index 2995872..0f9512d 100644
--- a/lib/intrinsics/src/utils.jl
+++ b/lib/intrinsics/src/utils.jl
@@ -64,7 +64,7 @@ Base.Experimental.@MethodTable(method_table)
macro device_override(ex)
esc(quote
- Base.Experimental.@overlay($method_table, $ex)
+ Base.Experimental.@overlay($method_table, $ex)
end)
end
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 03637b9..547ab1f 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -17,8 +17,8 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) =
job, fn) ||
in(fn, known_intrinsics) ||
let SPIRVIntrinsicsSIMDExt = Base.get_extension(SPIRVIntrinsics, :SPIRVIntrinsicsSIMDExt)
- SPIRVIntrinsicsSIMDExt !== nothing && in(fn, SPIRVIntrinsicsSIMDExt.known_intrinsics)
- end ||
+ SPIRVIntrinsicsSIMDExt !== nothing && in(fn, SPIRVIntrinsicsSIMDExt.known_intrinsics)
+end ||
contains(fn, "__spirv_")
GPUCompiler.kernel_state_type(::OpenCLCompilerJob) = KernelState
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 69150e5..07078a8 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -165,65 +165,65 @@ end
@test call_on_device(OpenCL.mad, x, y, z) ≈ x * y + z
end
-@testset "SIMD - $N x $T" for N in simd_ns, T in float_types
- # codegen emits i48 here, which SPIR-V doesn't support
- # XXX: fix upstream?
- T == Float16 && N == 3 && continue
-
- v = Vec{N, T}(ntuple(_ -> rand(T), N))
-
- # unary ops: sin, cos, sqrt
- a = call_on_device(sin, v)
- @test all(a[i] ≈ sin(v[i]) for i in 1:N)
-
- b = call_on_device(cos, v)
- @test all(b[i] ≈ cos(v[i]) for i in 1:N)
-
- c = call_on_device(sqrt, v)
- @test all(c[i] ≈ sqrt(v[i]) for i in 1:N)
-
- # binary ops: max, hypot
- w = Vec{N, T}(ntuple(_ -> rand(T), N))
- d = call_on_device(max, v, w)
- @test all(d[i] == max(v[i], w[i]) for i in 1:N)
-
- broken = ispocl && T == Float16
- if !broken
- h = call_on_device(hypot, v, w)
- @test all(h[i] ≈ hypot(v[i], w[i]) for i in 1:N)
- end
-
- # ternary op: fma
- x = Vec{N, T}(ntuple(_ -> rand(T), N))
- e = call_on_device(fma, v, w, x)
- @test all(e[i] ≈ fma(v[i], w[i], x[i]) for i in 1:N)
-
- # special cases: ilogb, ldexp, ^ with Int32, rootn
- v_pos = Vec{N, T}(ntuple(_ -> rand(T) + T(1), N))
- @test call_on_device(OpenCL.ilogb, v_pos) isa Vec{N, Int32} broken = broken
-
- k = Vec{N, Int32}(ntuple(_ -> rand(Int32.(-5:5)), N))
- @test let
- ldexp_result = call_on_device(ldexp, v_pos, k)
- all(ldexp_result[i] ≈ ldexp(v_pos[i], k[i]) for i in 1:N)
- end broken = broken
-
- base = Vec{N, T}(ntuple(_ -> rand(T) + T(0.5), N))
- exp_int = Vec{N, Int32}(ntuple(_ -> rand(Int32.(0:3)), N))
- @test let
- pow_result = call_on_device(^, base, exp_int)
- all(pow_result[i] ≈ base[i] ^ exp_int[i] for i in 1:N)
- end broken = broken
-
- rootn_base = Vec{N, T}(ntuple(_ -> rand(T) * T(10) + T(1), N))
- rootn_n = Vec{N, Int32}(ntuple(_ -> rand(Int32.(2:4)), N))
- @test call_on_device(OpenCL.rootn, rootn_base, rootn_n) isa Vec{N, T} broken = broken
-
- # special cases: nan
- nan_code = Vec{N, Base.uinttype(T)}(ntuple(_ -> rand(Base.uinttype(T)), N))
- nan_result = call_on_device(OpenCL.nan, nan_code)
- @test all(isnan(nan_result[i]) for i in 1:N)
-end
+ @testset "SIMD - $N x $T" for N in simd_ns, T in float_types
+ # codegen emits i48 here, which SPIR-V doesn't support
+ # XXX: fix upstream?
+ T == Float16 && N == 3 && continue
+
+ v = Vec{N, T}(ntuple(_ -> rand(T), N))
+
+ # unary ops: sin, cos, sqrt
+ a = call_on_device(sin, v)
+ @test all(a[i] ≈ sin(v[i]) for i in 1:N)
+
+ b = call_on_device(cos, v)
+ @test all(b[i] ≈ cos(v[i]) for i in 1:N)
+
+ c = call_on_device(sqrt, v)
+ @test all(c[i] ≈ sqrt(v[i]) for i in 1:N)
+
+ # binary ops: max, hypot
+ w = Vec{N, T}(ntuple(_ -> rand(T), N))
+ d = call_on_device(max, v, w)
+ @test all(d[i] == max(v[i], w[i]) for i in 1:N)
+
+ broken = ispocl && T == Float16
+ if !broken
+ h = call_on_device(hypot, v, w)
+ @test all(h[i] ≈ hypot(v[i], w[i]) for i in 1:N)
+ end
+
+ # ternary op: fma
+ x = Vec{N, T}(ntuple(_ -> rand(T), N))
+ e = call_on_device(fma, v, w, x)
+ @test all(e[i] ≈ fma(v[i], w[i], x[i]) for i in 1:N)
+
+ # special cases: ilogb, ldexp, ^ with Int32, rootn
+ v_pos = Vec{N, T}(ntuple(_ -> rand(T) + T(1), N))
+ @test call_on_device(OpenCL.ilogb, v_pos) isa Vec{N, Int32} broken = broken
+
+ k = Vec{N, Int32}(ntuple(_ -> rand(Int32.(-5:5)), N))
+ @test let
+ ldexp_result = call_on_device(ldexp, v_pos, k)
+ all(ldexp_result[i] ≈ ldexp(v_pos[i], k[i]) for i in 1:N)
+ end broken = broken
+
+ base = Vec{N, T}(ntuple(_ -> rand(T) + T(0.5), N))
+ exp_int = Vec{N, Int32}(ntuple(_ -> rand(Int32.(0:3)), N))
+ @test let
+ pow_result = call_on_device(^, base, exp_int)
+ all(pow_result[i] ≈ base[i]^exp_int[i] for i in 1:N)
+ end broken = broken
+
+ rootn_base = Vec{N, T}(ntuple(_ -> rand(T) * T(10) + T(1), N))
+ rootn_n = Vec{N, Int32}(ntuple(_ -> rand(Int32.(2:4)), N))
+ @test call_on_device(OpenCL.rootn, rootn_base, rootn_n) isa Vec{N, T} broken = broken
+
+ # special cases: nan
+ nan_code = Vec{N, Base.uinttype(T)}(ntuple(_ -> rand(Base.uinttype(T)), N))
+ nan_result = call_on_device(OpenCL.nan, nan_code)
+ @test all(isnan(nan_result[i]) for i in 1:N)
+ end
end
|
Codecov Report✅ All modified and coverable lines are covered by tests. Additional details and impacted files@@ Coverage Diff @@
## master #379 +/- ##
==========================================
+ Coverage 80.19% 80.22% +0.02%
==========================================
Files 12 12
Lines 722 723 +1
==========================================
+ Hits 579 580 +1
Misses 143 143 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
7e0917b to
8f0f428
Compare
simeonschaub
added a commit
to JuliaGPU/GPUCompiler.jl
that referenced
this pull request
Nov 4, 2025
It seems like the issue is that codegen hard codes `MAX_ALIGN` based on the host platform ABI and assumes that if the host supports `i128` allocas the target will support it as well. For now just handle this by converting `i128` allocas to `<2 x i64>` allocas. Discovered while working on JuliaGPU/OpenCL.jl#379 To reproduce the issue: ```julia-repl julia> using OpenCL, SIMD julia> OpenCL.code_llvm(NTuple{2, Vec{8, Float32}}) do x... @noinline +(x...) end ; @ REPL[7]:2 within `#11` define void @julia__11_16515(ptr noalias nocapture noundef nonnull sret([1 x <8 x float>]) align 16 dereferenceable(32) %sret_return, ptr nocapture noundef nonnull readonly align 16 dereferenceable(32) %"x[1]::Vec", ptr nocapture noundef nonnull readonly align 16 dereferenceable(32) %"x[2]::Vec") local_unnamed_addr { top: %"new::Tuple" = alloca [2 x [1 x <8 x float>]], align 16 %sret_box = alloca [2 x i128], align 16 call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) %"new::Tuple", ptr noundef nonnull align 16 dereferenceable(32) %"x[1]::Vec", i64 32, i1 false) %0 = getelementptr inbounds i8, ptr %"new::Tuple", i64 32 call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) %0, ptr noundef nonnull align 16 dereferenceable(32) %"x[2]::Vec", i64 32, i1 false) call fastcc void @julia___16519(ptr noalias nocapture noundef sret([1 x <8 x float>]) %sret_box, ptr nocapture readonly %"new::Tuple", ptr nocapture readonly %0) call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) %sret_return, ptr noundef nonnull align 16 dereferenceable(32) %sret_box, i64 32, i1 false) ret void } ``` A similar workaround might be needed for Metal, but I don't have a Mac to test
(Removing the overrides doesn't really buy us much either since I am then getting a bunch of miscompilations, likely due to the presence of throw statements) I exempted `hypot` for now, since that actually seems to be used in `GPUArrays`
8f0f428 to
8eb5eb4
Compare
simeonschaub
added a commit
to JuliaGPU/GPUCompiler.jl
that referenced
this pull request
Nov 12, 2025
It seems like the issue is that codegen hard codes `MAX_ALIGN` based on the host platform ABI and assumes that if the host supports `i128` allocas the target will support it as well. For now just handle this by converting `i128` allocas to `<2 x i64>` allocas. Discovered while working on JuliaGPU/OpenCL.jl#379 To reproduce the issue: ```julia-repl julia> using OpenCL, SIMD julia> OpenCL.code_llvm(NTuple{2, Vec{8, Float32}}) do x... @noinline +(x...) end ; @ REPL[7]:2 within `#11` define void @julia__11_16515(ptr noalias nocapture noundef nonnull sret([1 x <8 x float>]) align 16 dereferenceable(32) %sret_return, ptr nocapture noundef nonnull readonly align 16 dereferenceable(32) %"x[1]::Vec", ptr nocapture noundef nonnull readonly align 16 dereferenceable(32) %"x[2]::Vec") local_unnamed_addr { top: %"new::Tuple" = alloca [2 x [1 x <8 x float>]], align 16 %sret_box = alloca [2 x i128], align 16 call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) %"new::Tuple", ptr noundef nonnull align 16 dereferenceable(32) %"x[1]::Vec", i64 32, i1 false) %0 = getelementptr inbounds i8, ptr %"new::Tuple", i64 32 call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) %0, ptr noundef nonnull align 16 dereferenceable(32) %"x[2]::Vec", i64 32, i1 false) call fastcc void @julia___16519(ptr noalias nocapture noundef sret([1 x <8 x float>]) %sret_box, ptr nocapture readonly %"new::Tuple", ptr nocapture readonly %0) call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) %sret_return, ptr noundef nonnull align 16 dereferenceable(32) %sret_box, i64 32, i1 false) ret void } ``` A similar workaround might be needed for Metal, but I don't have a Mac to test
This was referenced Nov 19, 2025
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
closes #376