[SPIR-V] Initial support for SPIR-V in gpuintrin.h#174910
Conversation
|
@llvm/pr-subscribers-backend-x86 Author: Joseph Huber (jhuber6) — Changes. Summary: This should be the first step towards allowing SPIR-V to build things. Would appreciate someone more familiar with the backend double-checking these. Full diff: https://github.com/llvm/llvm-project/pull/174910.diff — 4 Files Affected:
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1b96ac417bf70..c92b370b88d2d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -297,6 +297,7 @@ set(gpu_files
gpuintrin.h
nvptxintrin.h
amdgpuintrin.h
+ spirvintrin.h
)
set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..30f3667adea73 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
#endif
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..bf5df70583dc6
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,171 @@
+//===-- spirvintrin.h - SPIR-V intrinsic functions ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIR-V targets or offloading to SPIR-V"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_constant __attribute__((address_space(2)))
+#define __gpu_local __attribute__((address_space(3)))
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+ return __builtin_spirv_num_workgroups(0);
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+ return __builtin_spirv_num_workgroups(1);
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+ return __builtin_spirv_num_workgroups(2);
+}
+
+// Returns the 'x' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+ return __builtin_spirv_workgroup_id(0);
+}
+
+// Returns the 'y' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+ return __builtin_spirv_workgroup_id(1);
+}
+
+// Returns the 'z' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+ return __builtin_spirv_workgroup_id(2);
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+ return __builtin_spirv_workgroup_size(0);
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+ return __builtin_spirv_workgroup_size(1);
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+ return __builtin_spirv_workgroup_size(2);
+}
+
+// Returns the 'x' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+ return __builtin_spirv_local_invocation_id(0);
+}
+
+// Returns the 'y' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+ return __builtin_spirv_local_invocation_id(1);
+}
+
+// Returns the 'z' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+ return __builtin_spirv_local_invocation_id(2);
+}
+
+// Returns the size of a wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+ return __builtin_spirv_subgroup_size();
+}
+
+// Returns the id of the thread inside of a wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+ return __builtin_spirv_subgroup_id();
+}
+
+// Returns the bit-mask of active threads in the current wavefront. This
+// implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+ uint32_t [[clang::ext_vector_type(4)]] __mask =
+ __builtin_spirv_subgroup_ballot(1);
+ return __builtin_bit_cast(uint64_t,
+ __builtin_shufflevector(__mask, __mask, 0, 1));
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __builtin_spirv_subgroup_shuffle(__x,
+ __builtin_ctzg(__gpu_lane_mask()));
+}
+
+// Returns a bitmask of threads in the current wavefront for which \p x is
+// true. This implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+ bool __x) {
+ // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+ // the active threads.
+ uint32_t [[clang::ext_vector_type(4)]] __mask =
+ __builtin_spirv_subgroup_ballot(__x);
+ return __lane_mask & __builtin_bit_cast(uint64_t, __builtin_shufflevector(
+ __mask, __mask, 0, 1));
+}
+
+// Wait for all threads in the wavefront to converge, this is a noop on SPIR-V.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
+ uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+ return __builtin_spirv_subgroup_shuffle(__x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#endif // __SPIRVINTRIN_H
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index 653f87aea2ce3..e3db72d5ff928 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -22,6 +22,11 @@
// RUN: -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
// RUN: | FileCheck %s --check-prefix=OPENMP
//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -DSYCL \
+// RUN: -internal-isystem %S/../../lib/Headers/ -fsycl-is-device \
+// RUN: -x c++ -triple spirv64 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=SYCL
+//
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
// RUN: -std=c89 -internal-isystem %S/../../lib/Headers/ \
// RUN: -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
@@ -32,11 +37,13 @@
#ifdef __device__
__device__ int foo() { return __gpu_thread_id_x(); }
+#elif defined(SYCL)
+extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
#else
// CUDA-LABEL: define dso_local i32 @foo(
// CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
// CUDA-NEXT: [[ENTRY:.*:]]
-// CUDA-NEXT: [[TMP0:%.*]] = call {{.*}}i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CUDA-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CUDA-NEXT: ret i32 [[TMP0]]
//
// HIP-LABEL: define dso_local i32 @foo(
@@ -61,6 +68,17 @@ __device__ int foo() { return __gpu_thread_id_x(); }
// OPENMP-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
// OPENMP-NEXT: ret i32 [[TMP0]]
//
+// SYCL-LABEL: define spir_func i32 @foo(
+// SYCL-SAME: ) #[[ATTR0:[0-9]+]] {
+// SYCL-NEXT: [[ENTRY:.*:]]
+// SYCL-NEXT: [[RETVAL_I:%.*]] = alloca i32, align 4
+// SYCL-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SYCL-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// SYCL-NEXT: [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr [[RETVAL_I]] to ptr addrspace(4)
+// SYCL-NEXT: [[SPV_THREAD_ID_IN_GROUP_I:%.*]] = call i64 @llvm.spv.thread.id.in.group.i64(i32 0)
+// SYCL-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPV_THREAD_ID_IN_GROUP_I]] to i32
+// SYCL-NEXT: ret i32 [[CONV_I]]
+//
// C89-LABEL: define dso_local i32 @foo(
// C89-SAME: ) #[[ATTR0:[0-9]+]] {
// C89-NEXT: [[ENTRY:.*:]]
|
|
@llvm/pr-subscribers-clang Author: Joseph Huber (jhuber6) — Changes. Summary: This should be the first step towards allowing SPIR-V to build things. Would appreciate someone more familiar with the backend double-checking these. Full diff: https://github.com/llvm/llvm-project/pull/174910.diff — 4 Files Affected:
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 1b96ac417bf70..c92b370b88d2d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -297,6 +297,7 @@ set(gpu_files
gpuintrin.h
nvptxintrin.h
amdgpuintrin.h
+ spirvintrin.h
)
set(windows_only_files
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 7afc82413996b..30f3667adea73 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -60,6 +60,8 @@ _Pragma("omp end declare target");
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
#include <amdgpuintrin.h>
+#elif defined(__SPIRV__)
+#include <spirvintrin.h>
#elif !defined(_OPENMP)
#error "This header is only meant to be used on GPU architectures."
#endif
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
new file mode 100644
index 0000000000000..bf5df70583dc6
--- /dev/null
+++ b/clang/lib/Headers/spirvintrin.h
@@ -0,0 +1,171 @@
+//===-- spirvintrin.h - SPIR-V intrinsic functions ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __SPIRVINTRIN_H
+#define __SPIRVINTRIN_H
+
+#ifndef __SPIRV__
+#error "This file is intended for SPIR-V targets or offloading to SPIR-V"
+#endif
+
+#ifndef __GPUINTRIN_H
+#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead"
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(spirv64)})");
+
+// Type aliases to the address spaces used by the SPIR-V backend.
+#define __gpu_private __attribute__((address_space(0)))
+#define __gpu_constant __attribute__((address_space(2)))
+#define __gpu_local __attribute__((address_space(3)))
+#define __gpu_global __attribute__((address_space(1)))
+#define __gpu_generic __attribute__((address_space(4)))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((device_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+ return __builtin_spirv_num_workgroups(0);
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+ return __builtin_spirv_num_workgroups(1);
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+ return __builtin_spirv_num_workgroups(2);
+}
+
+// Returns the 'x' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+ return __builtin_spirv_workgroup_id(0);
+}
+
+// Returns the 'y' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+ return __builtin_spirv_workgroup_id(1);
+}
+
+// Returns the 'z' dimension of the current workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+ return __builtin_spirv_workgroup_id(2);
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+ return __builtin_spirv_workgroup_size(0);
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+ return __builtin_spirv_workgroup_size(1);
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+ return __builtin_spirv_workgroup_size(2);
+}
+
+// Returns the 'x' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+ return __builtin_spirv_local_invocation_id(0);
+}
+
+// Returns the 'y' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+ return __builtin_spirv_local_invocation_id(1);
+}
+
+// Returns the 'z' dimension id of the workitem in the current workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+ return __builtin_spirv_local_invocation_id(2);
+}
+
+// Returns the size of a wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+ return __builtin_spirv_subgroup_size();
+}
+
+// Returns the id of the thread inside of a wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+ return __builtin_spirv_subgroup_id();
+}
+
+// Returns the bit-mask of active threads in the current wavefront. This
+// implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+ uint32_t [[clang::ext_vector_type(4)]] __mask =
+ __builtin_spirv_subgroup_ballot(1);
+ return __builtin_bit_cast(uint64_t,
+ __builtin_shufflevector(__mask, __mask, 0, 1));
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __builtin_spirv_subgroup_shuffle(__x,
+ __builtin_ctzg(__gpu_lane_mask()));
+}
+
+// Returns a bitmask of threads in the current wavefront for which \p x is
+// true. This implementation is incorrect if the target uses more than 64 lanes.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+ bool __x) {
+ // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
+ // the active threads.
+ uint32_t [[clang::ext_vector_type(4)]] __mask =
+ __builtin_spirv_subgroup_ballot(__x);
+ return __lane_mask & __builtin_bit_cast(uint64_t, __builtin_shufflevector(
+ __mask, __mask, 0, 1));
+}
+
+// Wait for all threads in the wavefront to converge, this is a noop on SPIR-V.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
+ uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+ return __builtin_spirv_subgroup_shuffle(__x, __lane);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_any_u32_impl(__lane_mask, __x);
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_any_u64_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ return __gpu_match_all_u32_impl(__lane_mask, __x);
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ return __gpu_match_all_u64_impl(__lane_mask, __x);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#endif // __SPIRVINTRIN_H
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index 653f87aea2ce3..e3db72d5ff928 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -22,6 +22,11 @@
// RUN: -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
// RUN: | FileCheck %s --check-prefix=OPENMP
//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -DSYCL \
+// RUN: -internal-isystem %S/../../lib/Headers/ -fsycl-is-device \
+// RUN: -x c++ -triple spirv64 -emit-llvm %s -o - \
+// RUN: | FileCheck %s --check-prefix=SYCL
+//
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
// RUN: -std=c89 -internal-isystem %S/../../lib/Headers/ \
// RUN: -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
@@ -32,11 +37,13 @@
#ifdef __device__
__device__ int foo() { return __gpu_thread_id_x(); }
+#elif defined(SYCL)
+extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
#else
// CUDA-LABEL: define dso_local i32 @foo(
// CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
// CUDA-NEXT: [[ENTRY:.*:]]
-// CUDA-NEXT: [[TMP0:%.*]] = call {{.*}}i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CUDA-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CUDA-NEXT: ret i32 [[TMP0]]
//
// HIP-LABEL: define dso_local i32 @foo(
@@ -61,6 +68,17 @@ __device__ int foo() { return __gpu_thread_id_x(); }
// OPENMP-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
// OPENMP-NEXT: ret i32 [[TMP0]]
//
+// SYCL-LABEL: define spir_func i32 @foo(
+// SYCL-SAME: ) #[[ATTR0:[0-9]+]] {
+// SYCL-NEXT: [[ENTRY:.*:]]
+// SYCL-NEXT: [[RETVAL_I:%.*]] = alloca i32, align 4
+// SYCL-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// SYCL-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// SYCL-NEXT: [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr [[RETVAL_I]] to ptr addrspace(4)
+// SYCL-NEXT: [[SPV_THREAD_ID_IN_GROUP_I:%.*]] = call i64 @llvm.spv.thread.id.in.group.i64(i32 0)
+// SYCL-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPV_THREAD_ID_IN_GROUP_I]] to i32
+// SYCL-NEXT: ret i32 [[CONV_I]]
+//
// C89-LABEL: define dso_local i32 @foo(
// C89-SAME: ) #[[ATTR0:[0-9]+]] {
// C89-NEXT: [[ENTRY:.*:]]
|
|
This will fail tests until the dependent PRs are merged. Inspecting the basic IR makes sense but I have no way to test this. Hopefully @sarnex can help here in the future because this should make porting the OpenMP support much easier. The SPIR-V intrinsics are missing thread syncs, an exit, and the pointer introspections. No clue if I got the address spaces or the thread -> grid accessors right. |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
sarnex
left a comment
There was a problem hiding this comment.
lgtm, but i asked greg from my team to also take a look at this since he's more familiar with what the correct logic should be
| #define __gpu_generic __attribute__((address_space(4))) | ||
|
|
||
| // Attribute to declare a function as a kernel. | ||
| #define __gpu_kernel __attribute__((device_kernel, visibility("protected"))) |
There was a problem hiding this comment.
maybe we could unify all these and move it to gpuintrin.h and remove it from each target's header since i unified the attrs a while ago?
There was a problem hiding this comment.
Yeah, I'll do a pass to simplify that in the future since it applies to the libc code as well.
|
Couple of questions:
|
Yes, these aren't intended to be a completely inclusive set. I'm working on exposing
I don't know exactly how SPIR-V works. It seems that some things are resolved as external functions and hooked up by some Khronos tool? I'd prefer if we moved away from that now that we have a backend. Correct me if I'm wrong here. The proper way of doing this is always builtins to LLVM backend intrinsics, everything else is more of a temporary hack as far as I'm aware. The |
7a4076d to
b4ba797
Compare
Summary: llvm#174862 and llvm#174655 provided the intrinsics required to get the fundamental operations working for these. This patch sets up the basic support (as far as I know). This should be the first step towards allowing SPIR-V to build things like the LLVM libc and the OpenMP Device Runtime Library. The implementations here are intentionally inefficient, such as not using the dedicated SPIR-V opcode for read firstlane. This is just to start and hopefully start testing things later. Would appreciate someone more familiar with the backend double-checking these.
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/33957 Here is the relevant piece of the build log for the reference |
Summary: llvm#174862 and llvm#174655 provided the intrinsics required to get the fundamental operations working for these. This patch sets up the basic support (as far as I know). This should be the first step towards allowing SPIR-V to build things like the LLVM libc and the OpenMP Device Runtime Library. The implementations here are intentionally inefficient, such as not using the dedicated SPIR-V opcode for read firstlane. This is just to start and hopefully start testing things later. Would appreciate someone more familiar with the backend double-checking these.
Summary:
#174862 and
#174655 provided the intrinsics
required to get the fundamental operations working for these. This patch
sets up the basic support (as far as I know).
This should be the first step towards allowing SPIR-V to build things
like the LLVM libc and the OpenMP Device Runtime Library. The
implementations here are intentionally inefficient, such as not using
the dedicated SPIR-V opcode for read firstlane. This is just to start
and hopefully start testing things later.
Would appreciate someone more familiar with the backend double-checking
these.