[AMDGPU] Change llvm.amdgcn.image.bvh.intersect.ray to take vec3 args

The ray_origin, ray_dir and ray_inv_dir arguments should all be vec3 to
match how the hardware instruction works.

Don't change the API of the corresponding OpenCL builtins.

Differential Revision: https://reviews.llvm.org/D115032
This commit is contained in:
Jay Foad 2021-12-02 12:26:59 +00:00
parent c8e84c7a5f
commit 2774bad112
7 changed files with 445 additions and 466 deletions

View File

@ -16592,6 +16592,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));
// The builtins take these arguments as vec4 where the last element is
// ignored. The intrinsic takes them as vec3.
RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
ArrayRef<int>{0, 1, 2});
RayDir =
Builder.CreateShuffleVector(RayDir, RayDir, ArrayRef<int>{0, 1, 2});
RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
ArrayRef<int>{0, 1, 2});
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
{NodePtr->getType(), RayDir->getType()});
return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,

View File

@ -19,7 +19,7 @@ typedef double double4 __attribute__((ext_vector_type(4)));
typedef half half4 __attribute__((ext_vector_type(4)));
typedef uint uint4 __attribute__((ext_vector_type(4)));
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32
// ISA: image_bvh_intersect_ray
void test_image_bvh_intersect_ray(global uint4* out, uint node_ptr,
float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir,
@ -29,7 +29,7 @@ void test_image_bvh_intersect_ray(global uint4* out, uint node_ptr,
ray_origin, ray_dir, ray_inv_dir, texture_descr);
}
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f16
// ISA: image_bvh_intersect_ray
void test_image_bvh_intersect_ray_h(global uint4* out, uint node_ptr,
float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir,
@ -39,7 +39,7 @@ void test_image_bvh_intersect_ray_h(global uint4* out, uint node_ptr,
ray_origin, ray_dir, ray_inv_dir, texture_descr);
}
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f32
// ISA: image_bvh_intersect_ray
void test_image_bvh_intersect_ray_l(global uint4* out, ulong node_ptr,
float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir,
@ -49,7 +49,7 @@ void test_image_bvh_intersect_ray_l(global uint4* out, ulong node_ptr,
ray_origin, ray_dir, ray_inv_dir, texture_descr);
}
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16
// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f16
// ISA: image_bvh_intersect_ray
void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr,
float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir,

View File

@ -1789,9 +1789,11 @@ def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;
// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
// <ray_dir>, <ray_inv_dir>, <texture_descr>
// <node_ptr> is i32 or i64.
// <ray_origin> is v3f32.
// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
def int_amdgcn_image_bvh_intersect_ray :
Intrinsic<[llvm_v4i32_ty],
[llvm_anyint_ty, llvm_float_ty, llvm_v4f32_ty, llvm_anyvector_ty,
[llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
LLVMMatchType<1>, llvm_v4i32_ty],
[IntrReadMem, IntrWillReturn]>;

View File

@ -4869,8 +4869,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
}
Ops.push_back(RayExtent);
auto packLanes = [&Ops, &S32, &B] (Register Src) {
auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
auto packLanes = [&Ops, &S32, &B](Register Src) {
auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
Ops.push_back(Unmerge.getReg(0));
Ops.push_back(Unmerge.getReg(1));
Ops.push_back(Unmerge.getReg(2));
@ -4878,8 +4878,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
packLanes(RayOrigin);
if (IsA16) {
auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
Register R1 = MRI.createGenericVirtualRegister(S32);
Register R2 = MRI.createGenericVirtualRegister(S32);
Register R3 = MRI.createGenericVirtualRegister(S32);

View File

@ -7503,8 +7503,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
assert(NodePtr.getValueType() == MVT::i32 ||
NodePtr.getValueType() == MVT::i64);
assert(RayDir.getValueType() == MVT::v4f16 ||
RayDir.getValueType() == MVT::v4f32);
assert(RayDir.getValueType() == MVT::v3f16 ||
RayDir.getValueType() == MVT::v3f32);
if (!Subtarget->hasGFX10_AEncoding()) {
emitRemovedIntrinsicError(DAG, DL, Op.getValueType());

View File

@ -3,37 +3,25 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s
; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
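; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
; NOTE(review): the .v4f32/.v4f16 name suffixes used here and in the declarations below no longer match the <3 x float>/<3 x half> operand types; the mangled names should presumably be .v3f32/.v3f16 -- confirm and rename.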
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh_intersect_ray:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: v_mov_b32_e32 v5, v6
; GFX1013-NEXT: v_mov_b32_e32 v6, v7
; GFX1013-NEXT: v_mov_b32_e32 v7, v8
; GFX1013-NEXT: v_mov_b32_e32 v8, v10
; GFX1013-NEXT: v_mov_b32_e32 v9, v11
; GFX1013-NEXT: v_mov_b32_e32 v10, v12
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: ; return to shader part epilog
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray:
; GCN: ; %bb.0:
; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
@ -44,60 +32,48 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float
; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
%ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
%ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
%ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
%ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
%ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
%ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray_a16:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s4, 0xffff
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6
; GCN-NEXT: v_and_b32_e32 v10, s4, v8
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_and_b32_e32 v9, s4, v9
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v5
; GCN-NEXT: v_and_b32_e32 v10, s4, v7
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_and_b32_e32 v8, s4, v8
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_and_or_b32 v5, v6, s4, v5
; GCN-NEXT: v_and_or_b32 v6, v7, s4, v10
; GCN-NEXT: v_lshl_or_b32 v7, v9, 16, v8
; GCN-NEXT: v_lshl_or_b32 v7, v8, 16, v7
; GCN-NEXT: v_and_or_b32 v5, v5, s4, v9
; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10
; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh64_intersect_ray:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: v_mov_b32_e32 v6, v7
; GFX1013-NEXT: v_mov_b32_e32 v7, v8
; GFX1013-NEXT: v_mov_b32_e32 v8, v9
; GFX1013-NEXT: v_mov_b32_e32 v9, v11
; GFX1013-NEXT: v_mov_b32_e32 v10, v12
; GFX1013-NEXT: v_mov_b32_e32 v11, v13
; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray:
; GCN: ; %bb.0:
; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
@ -109,67 +85,70 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
%ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
%ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
%ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
%ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
%ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
%ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
%ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray_a16:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s4, 0xffff
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7
; GCN-NEXT: v_and_b32_e32 v11, s4, v9
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_and_b32_e32 v10, s4, v10
; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6
; GCN-NEXT: v_and_b32_e32 v11, s4, v8
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_and_b32_e32 v9, s4, v9
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_and_or_b32 v6, v7, s4, v6
; GCN-NEXT: v_and_or_b32 v7, v8, s4, v11
; GCN-NEXT: v_lshl_or_b32 v8, v10, 16, v9
; GCN-NEXT: v_lshl_or_b32 v8, v9, 16, v8
; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10
; GCN-NEXT: v_and_or_b32 v7, v7, s4, v11
; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: v_mov_b32_e32 v18, v0
; GFX1030-NEXT: v_mov_b32_e32 v19, v1
; GFX1030-NEXT: v_mov_b32_e32 v20, v2
; GFX1030-NEXT: v_mov_b32_e32 v21, v3
; GFX1030-NEXT: v_mov_b32_e32 v22, v4
; GFX1030-NEXT: v_mov_b32_e32 v23, v6
; GFX1030-NEXT: v_mov_b32_e32 v24, v7
; GFX1030-NEXT: v_mov_b32_e32 v25, v8
; GFX1030-NEXT: v_mov_b32_e32 v26, v10
; GFX1030-NEXT: v_mov_b32_e32 v27, v11
; GFX1030-NEXT: v_mov_b32_e32 v28, v12
; GFX1030-NEXT: v_mov_b32_e32 v15, v0
; GFX1030-NEXT: v_mov_b32_e32 v16, v1
; GFX1030-NEXT: v_mov_b32_e32 v17, v2
; GFX1030-NEXT: v_mov_b32_e32 v18, v3
; GFX1030-NEXT: v_mov_b32_e32 v19, v4
; GFX1030-NEXT: v_mov_b32_e32 v20, v5
; GFX1030-NEXT: v_mov_b32_e32 v21, v6
; GFX1030-NEXT: v_mov_b32_e32 v22, v7
; GFX1030-NEXT: v_mov_b32_e32 v23, v8
; GFX1030-NEXT: v_mov_b32_e32 v24, v9
; GFX1030-NEXT: v_mov_b32_e32 v25, v10
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: v_readfirstlane_b32 s4, v14
; GFX1030-NEXT: v_readfirstlane_b32 s5, v15
; GFX1030-NEXT: v_readfirstlane_b32 s6, v16
; GFX1030-NEXT: v_readfirstlane_b32 s7, v17
; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
; GFX1030-NEXT: v_readfirstlane_b32 s4, v11
; GFX1030-NEXT: v_readfirstlane_b32 s5, v12
; GFX1030-NEXT: v_readfirstlane_b32 s6, v13
; GFX1030-NEXT: v_readfirstlane_b32 s7, v14
; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[18:33], s[4:7]
; GFX1030-NEXT: ; implicit-def: $vgpr14_vgpr15
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:30], s[4:7]
; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12
; GFX1030-NEXT: ; implicit-def: $vgpr15
; GFX1030-NEXT: ; implicit-def: $vgpr16
; GFX1030-NEXT: ; implicit-def: $vgpr17
; GFX1030-NEXT: ; implicit-def: $vgpr18
; GFX1030-NEXT: ; implicit-def: $vgpr19
; GFX1030-NEXT: ; implicit-def: $vgpr20
@ -178,10 +157,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
; GFX1030-NEXT: ; implicit-def: $vgpr23
; GFX1030-NEXT: ; implicit-def: $vgpr24
; GFX1030-NEXT: ; implicit-def: $vgpr25
; GFX1030-NEXT: ; implicit-def: $vgpr26
; GFX1030-NEXT: ; implicit-def: $vgpr27
; GFX1030-NEXT: ; implicit-def: $vgpr28
; GFX1030-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17
; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT: s_cbranch_execnz .LBB6_1
; GFX1030-NEXT: ; %bb.2:
@ -191,28 +167,24 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
;
; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: v_mov_b32_e32 v5, v6
; GFX1013-NEXT: v_mov_b32_e32 v6, v7
; GFX1013-NEXT: v_mov_b32_e32 v7, v8
; GFX1013-NEXT: v_mov_b32_e32 v8, v10
; GFX1013-NEXT: v_mov_b32_e32 v9, v11
; GFX1013-NEXT: v_mov_b32_e32 v10, v12
; GFX1013-NEXT: v_mov_b32_e32 v18, v14
; GFX1013-NEXT: v_mov_b32_e32 v19, v15
; GFX1013-NEXT: v_mov_b32_e32 v16, v11
; GFX1013-NEXT: v_mov_b32_e32 v17, v12
; GFX1013-NEXT: v_mov_b32_e32 v18, v13
; GFX1013-NEXT: v_mov_b32_e32 v19, v14
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT: v_readfirstlane_b32 s4, v18
; GFX1013-NEXT: v_readfirstlane_b32 s5, v19
; GFX1013-NEXT: v_readfirstlane_b32 s6, v16
; GFX1013-NEXT: v_readfirstlane_b32 s7, v17
; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
; GFX1013-NEXT: v_readfirstlane_b32 s4, v16
; GFX1013-NEXT: v_readfirstlane_b32 s5, v17
; GFX1013-NEXT: v_readfirstlane_b32 s6, v18
; GFX1013-NEXT: v_readfirstlane_b32 s7, v19
; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
; GFX1013-NEXT: image_bvh_intersect_ray v[20:23], v[0:15], s[4:7]
; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17
; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX1013-NEXT: ; implicit-def: $vgpr18_vgpr19
; GFX1013-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17
; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB6_1
@ -224,15 +196,186 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
; GFX1013-NEXT: v_mov_b32_e32 v2, v22
; GFX1013-NEXT: v_mov_b32_e32 v3, v23
; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_mov_b32 s0, 0xffff
; GFX1030-NEXT: v_mov_b32_e32 v13, v0
; GFX1030-NEXT: v_mov_b32_e32 v14, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5
; GFX1030-NEXT: v_and_b32_e32 v1, s0, v7
; GFX1030-NEXT: v_mov_b32_e32 v15, v2
; GFX1030-NEXT: v_mov_b32_e32 v16, v3
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v7
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX1030-NEXT: v_and_b32_e32 v3, s0, v8
; GFX1030-NEXT: v_mov_b32_e32 v17, v4
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: v_and_or_b32 v18, v5, s0, v0
; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v1
; GFX1030-NEXT: v_lshl_or_b32 v20, v3, 16, v2
; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: v_readfirstlane_b32 s4, v9
; GFX1030-NEXT: v_readfirstlane_b32 s5, v10
; GFX1030-NEXT: v_readfirstlane_b32 s6, v11
; GFX1030-NEXT: v_readfirstlane_b32 s7, v12
; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[13:20], s[4:7] a16
; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1030-NEXT: ; implicit-def: $vgpr13
; GFX1030-NEXT: ; implicit-def: $vgpr14
; GFX1030-NEXT: ; implicit-def: $vgpr15
; GFX1030-NEXT: ; implicit-def: $vgpr16
; GFX1030-NEXT: ; implicit-def: $vgpr17
; GFX1030-NEXT: ; implicit-def: $vgpr18
; GFX1030-NEXT: ; implicit-def: $vgpr19
; GFX1030-NEXT: ; implicit-def: $vgpr20
; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT: s_cbranch_execnz .LBB7_1
; GFX1030-NEXT: ; %bb.2:
; GFX1030-NEXT: s_mov_b32 exec_lo, s1
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_mov_b32 s0, 0xffff
; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5
; GFX1013-NEXT: v_and_b32_e32 v14, s0, v7
; GFX1013-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX1013-NEXT: v_and_b32_e32 v8, s0, v8
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX1013-NEXT: v_lshl_or_b32 v7, v8, 16, v7
; GFX1013-NEXT: v_and_or_b32 v5, v5, s0, v13
; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v14
; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT: v_readfirstlane_b32 s4, v9
; GFX1013-NEXT: v_readfirstlane_b32 s5, v10
; GFX1013-NEXT: v_readfirstlane_b32 s6, v11
; GFX1013-NEXT: v_readfirstlane_b32 s7, v12
; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
; GFX1013-NEXT: image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16
; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB7_1
; GFX1013-NEXT: ; %bb.2:
; GFX1013-NEXT: s_mov_b32 exec_lo, s1
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, v13
; GFX1013-NEXT: v_mov_b32_e32 v1, v14
; GFX1013-NEXT: v_mov_b32_e32 v2, v15
; GFX1013-NEXT: v_mov_b32_e32 v3, v16
; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: v_mov_b32_e32 v16, v0
; GFX1030-NEXT: v_mov_b32_e32 v17, v1
; GFX1030-NEXT: v_mov_b32_e32 v18, v2
; GFX1030-NEXT: v_mov_b32_e32 v19, v3
; GFX1030-NEXT: v_mov_b32_e32 v20, v4
; GFX1030-NEXT: v_mov_b32_e32 v21, v5
; GFX1030-NEXT: v_mov_b32_e32 v22, v6
; GFX1030-NEXT: v_mov_b32_e32 v23, v7
; GFX1030-NEXT: v_mov_b32_e32 v24, v8
; GFX1030-NEXT: v_mov_b32_e32 v25, v9
; GFX1030-NEXT: v_mov_b32_e32 v26, v10
; GFX1030-NEXT: v_mov_b32_e32 v27, v11
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: v_readfirstlane_b32 s4, v12
; GFX1030-NEXT: v_readfirstlane_b32 s5, v13
; GFX1030-NEXT: v_readfirstlane_b32 s6, v14
; GFX1030-NEXT: v_readfirstlane_b32 s7, v15
; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[16:31], s[4:7]
; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13
; GFX1030-NEXT: ; implicit-def: $vgpr16
; GFX1030-NEXT: ; implicit-def: $vgpr17
; GFX1030-NEXT: ; implicit-def: $vgpr18
; GFX1030-NEXT: ; implicit-def: $vgpr19
; GFX1030-NEXT: ; implicit-def: $vgpr20
; GFX1030-NEXT: ; implicit-def: $vgpr21
; GFX1030-NEXT: ; implicit-def: $vgpr22
; GFX1030-NEXT: ; implicit-def: $vgpr23
; GFX1030-NEXT: ; implicit-def: $vgpr24
; GFX1030-NEXT: ; implicit-def: $vgpr25
; GFX1030-NEXT: ; implicit-def: $vgpr26
; GFX1030-NEXT: ; implicit-def: $vgpr27
; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT: s_cbranch_execnz .LBB8_1
; GFX1030-NEXT: ; %bb.2:
; GFX1030-NEXT: s_mov_b32 exec_lo, s1
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: v_mov_b32_e32 v16, v12
; GFX1013-NEXT: v_mov_b32_e32 v17, v13
; GFX1013-NEXT: v_mov_b32_e32 v18, v14
; GFX1013-NEXT: v_mov_b32_e32 v19, v15
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT: v_readfirstlane_b32 s4, v16
; GFX1013-NEXT: v_readfirstlane_b32 s5, v17
; GFX1013-NEXT: v_readfirstlane_b32 s6, v18
; GFX1013-NEXT: v_readfirstlane_b32 s7, v19
; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
; GFX1013-NEXT: image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7]
; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17
; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB8_1
; GFX1013-NEXT: ; %bb.2:
; GFX1013-NEXT: s_mov_b32 exec_lo, s1
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, v20
; GFX1013-NEXT: v_mov_b32_e32 v1, v21
; GFX1013-NEXT: v_mov_b32_e32 v2, v22
; GFX1013-NEXT: v_mov_b32_e32 v3, v23
; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_mov_b32 s0, 0xffff
; GFX1030-NEXT: v_mov_b32_e32 v14, v0
; GFX1030-NEXT: v_mov_b32_e32 v15, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6
@ -244,11 +387,12 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX1030-NEXT: v_and_b32_e32 v3, s0, v9
; GFX1030-NEXT: v_mov_b32_e32 v18, v4
; GFX1030-NEXT: v_mov_b32_e32 v19, v5
; GFX1030-NEXT: v_and_or_b32 v20, v6, s0, v0
; GFX1030-NEXT: v_and_or_b32 v21, v7, s0, v1
; GFX1030-NEXT: v_lshl_or_b32 v22, v3, 16, v2
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v0
; GFX1030-NEXT: v_and_or_b32 v20, v7, s0, v1
; GFX1030-NEXT: v_lshl_or_b32 v21, v3, 16, v2
; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: v_readfirstlane_b32 s4, v10
; GFX1030-NEXT: v_readfirstlane_b32 s5, v11
; GFX1030-NEXT: v_readfirstlane_b32 s6, v12
@ -257,7 +401,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[14:21], s[4:7] a16
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:29], s[4:7] a16
; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11
; GFX1030-NEXT: ; implicit-def: $vgpr14
; GFX1030-NEXT: ; implicit-def: $vgpr15
@ -267,185 +411,9 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
; GFX1030-NEXT: ; implicit-def: $vgpr19
; GFX1030-NEXT: ; implicit-def: $vgpr20
; GFX1030-NEXT: ; implicit-def: $vgpr21
; GFX1030-NEXT: ; implicit-def: $vgpr22
; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT: s_cbranch_execnz .LBB7_1
; GFX1030-NEXT: ; %bb.2:
; GFX1030-NEXT: s_mov_b32 exec_lo, s1
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_mov_b32 s0, 0xffff
; GFX1013-NEXT: v_lshrrev_b32_e32 v5, 16, v6
; GFX1013-NEXT: v_and_b32_e32 v14, s0, v8
; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX1013-NEXT: v_and_or_b32 v5, v6, s0, v5
; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v14
; GFX1013-NEXT: v_lshl_or_b32 v7, v9, 16, v8
; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT: v_readfirstlane_b32 s4, v10
; GFX1013-NEXT: v_readfirstlane_b32 s5, v11
; GFX1013-NEXT: v_readfirstlane_b32 s6, v12
; GFX1013-NEXT: v_readfirstlane_b32 s7, v13
; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
; GFX1013-NEXT: image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16
; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11
; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB7_1
; GFX1013-NEXT: ; %bb.2:
; GFX1013-NEXT: s_mov_b32 exec_lo, s1
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, v14
; GFX1013-NEXT: v_mov_b32_e32 v1, v15
; GFX1013-NEXT: v_mov_b32_e32 v2, v16
; GFX1013-NEXT: v_mov_b32_e32 v3, v17
; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
; Test: 64-bit node pointer form of the BVH intersect-ray intrinsic where no
; argument is marked inreg, so the <4 x i32> texture descriptor is not known to
; be uniform. The expected code for both targets below wraps the image
; instruction in a loop: v_readfirstlane_b32 copies the descriptor into s[4:7],
; the two v_cmp_eq_u64 / s_and_saveexec restrict exec to the lanes holding that
; same descriptor, and s_xor/s_cbranch_execnz repeat until every lane is done.
; This is the <4 x float> form of the ray vectors. In the gfx1030 expectations
; the register copies skip v6, v10 and v14 -- presumably the unused fourth
; element of each vector (NOTE(review): inferred from the copy pattern, confirm
; against the argument register assignment).
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: v_mov_b32_e32 v19, v0
; GFX1030-NEXT: v_mov_b32_e32 v20, v1
; GFX1030-NEXT: v_mov_b32_e32 v21, v2
; GFX1030-NEXT: v_mov_b32_e32 v22, v3
; GFX1030-NEXT: v_mov_b32_e32 v23, v4
; GFX1030-NEXT: v_mov_b32_e32 v24, v5
; GFX1030-NEXT: v_mov_b32_e32 v25, v7
; GFX1030-NEXT: v_mov_b32_e32 v26, v8
; GFX1030-NEXT: v_mov_b32_e32 v27, v9
; GFX1030-NEXT: v_mov_b32_e32 v28, v11
; GFX1030-NEXT: v_mov_b32_e32 v29, v12
; GFX1030-NEXT: v_mov_b32_e32 v30, v13
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: v_readfirstlane_b32 s4, v15
; GFX1030-NEXT: v_readfirstlane_b32 s5, v16
; GFX1030-NEXT: v_readfirstlane_b32 s6, v17
; GFX1030-NEXT: v_readfirstlane_b32 s7, v18
; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[19:34], s[4:7]
; GFX1030-NEXT: ; implicit-def: $vgpr15_vgpr16
; GFX1030-NEXT: ; implicit-def: $vgpr19
; GFX1030-NEXT: ; implicit-def: $vgpr20
; GFX1030-NEXT: ; implicit-def: $vgpr21
; GFX1030-NEXT: ; implicit-def: $vgpr22
; GFX1030-NEXT: ; implicit-def: $vgpr23
; GFX1030-NEXT: ; implicit-def: $vgpr24
; GFX1030-NEXT: ; implicit-def: $vgpr25
; GFX1030-NEXT: ; implicit-def: $vgpr26
; GFX1030-NEXT: ; implicit-def: $vgpr27
; GFX1030-NEXT: ; implicit-def: $vgpr28
; GFX1030-NEXT: ; implicit-def: $vgpr29
; GFX1030-NEXT: ; implicit-def: $vgpr30
; GFX1030-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18
; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT: s_cbranch_execnz .LBB8_1
; GFX1030-NEXT: ; %bb.2:
; GFX1030-NEXT: s_mov_b32 exec_lo, s1
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: v_mov_b32_e32 v6, v7
; GFX1013-NEXT: v_mov_b32_e32 v7, v8
; GFX1013-NEXT: v_mov_b32_e32 v8, v9
; GFX1013-NEXT: v_mov_b32_e32 v9, v11
; GFX1013-NEXT: v_mov_b32_e32 v10, v12
; GFX1013-NEXT: v_mov_b32_e32 v11, v13
; GFX1013-NEXT: v_mov_b32_e32 v19, v15
; GFX1013-NEXT: v_mov_b32_e32 v20, v16
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT: v_readfirstlane_b32 s4, v19
; GFX1013-NEXT: v_readfirstlane_b32 s5, v20
; GFX1013-NEXT: v_readfirstlane_b32 s6, v17
; GFX1013-NEXT: v_readfirstlane_b32 s7, v18
; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20]
; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1013-NEXT: s_and_saveexec_b32 s0, s0
; GFX1013-NEXT: image_bvh64_intersect_ray v[21:24], v[0:15], s[4:7]
; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX1013-NEXT: ; implicit-def: $vgpr19_vgpr20
; GFX1013-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18
; GFX1013-NEXT: s_waitcnt_depctr 0xffe3
; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT: s_cbranch_execnz .LBB8_1
; GFX1013-NEXT: ; %bb.2:
; GFX1013-NEXT: s_mov_b32 exec_lo, s1
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, v21
; GFX1013-NEXT: v_mov_b32_e32 v1, v22
; GFX1013-NEXT: v_mov_b32_e32 v2, v23
; GFX1013-NEXT: v_mov_b32_e32 v3, v24
; GFX1013-NEXT: ; return to shader part epilog
; Intrinsic under test; its <4 x i32> result is bitcast to <4 x float> so it can
; be returned from the amdgpu_ps function.
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_mov_b32 s0, 0xffff
; GFX1030-NEXT: v_mov_b32_e32 v15, v0
; GFX1030-NEXT: v_mov_b32_e32 v16, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v7
; GFX1030-NEXT: v_and_b32_e32 v1, s0, v9
; GFX1030-NEXT: v_mov_b32_e32 v17, v2
; GFX1030-NEXT: v_mov_b32_e32 v18, v3
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v9
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX1030-NEXT: v_and_b32_e32 v3, s0, v10
; GFX1030-NEXT: v_mov_b32_e32 v19, v4
; GFX1030-NEXT: v_mov_b32_e32 v20, v5
; GFX1030-NEXT: v_and_or_b32 v21, v7, s0, v0
; GFX1030-NEXT: v_and_or_b32 v22, v8, s0, v1
; GFX1030-NEXT: v_lshl_or_b32 v23, v3, 16, v2
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: v_readfirstlane_b32 s4, v11
; GFX1030-NEXT: v_readfirstlane_b32 s5, v12
; GFX1030-NEXT: v_readfirstlane_b32 s6, v13
; GFX1030-NEXT: v_readfirstlane_b32 s7, v14
; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo
; GFX1030-NEXT: s_and_saveexec_b32 s0, s0
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[15:30], s[4:7] a16
; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12
; GFX1030-NEXT: ; implicit-def: $vgpr15
; GFX1030-NEXT: ; implicit-def: $vgpr16
; GFX1030-NEXT: ; implicit-def: $vgpr17
; GFX1030-NEXT: ; implicit-def: $vgpr18
; GFX1030-NEXT: ; implicit-def: $vgpr19
; GFX1030-NEXT: ; implicit-def: $vgpr20
; GFX1030-NEXT: ; implicit-def: $vgpr21
; GFX1030-NEXT: ; implicit-def: $vgpr22
; GFX1030-NEXT: ; implicit-def: $vgpr23
; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT: s_cbranch_execnz .LBB9_1
; GFX1030-NEXT: ; %bb.2:
; GFX1030-NEXT: s_mov_b32 exec_lo, s1
@ -455,20 +423,20 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_mov_b32 s0, 0xffff
; GFX1013-NEXT: v_mov_b32_e32 v16, v11
; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7
; GFX1013-NEXT: v_and_b32_e32 v11, s0, v9
; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX1013-NEXT: v_and_b32_e32 v10, s0, v10
; GFX1013-NEXT: v_mov_b32_e32 v17, v12
; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX1013-NEXT: v_mov_b32_e32 v16, v10
; GFX1013-NEXT: v_mov_b32_e32 v17, v11
; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6
; GFX1013-NEXT: v_and_b32_e32 v11, s0, v8
; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9
; GFX1013-NEXT: v_mov_b32_e32 v18, v12
; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX1013-NEXT: v_mov_b32_e32 v18, v13
; GFX1013-NEXT: v_mov_b32_e32 v19, v14
; GFX1013-NEXT: v_mov_b32_e32 v19, v13
; GFX1013-NEXT: v_lshl_or_b32 v8, v9, 16, v8
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v6
; GFX1013-NEXT: v_and_or_b32 v7, v8, s0, v11
; GFX1013-NEXT: v_lshl_or_b32 v8, v10, 16, v9
; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v10
; GFX1013-NEXT: v_and_or_b32 v7, v7, s0, v11
; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT: v_readfirstlane_b32 s4, v16
; GFX1013-NEXT: v_readfirstlane_b32 s5, v17
@ -493,7 +461,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
; GFX1013-NEXT: v_mov_b32_e32 v2, v22
; GFX1013-NEXT: v_mov_b32_e32 v3, v23
; GFX1013-NEXT: ; return to shader part epilog
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
@ -567,16 +535,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
%node_ptr = load i32, i32* %gep_node_ptr, align 4
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
%ray_extent = load float, float* %gep_ray, align 4
%ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
%ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
%ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
%ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
%ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
%ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
%ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
%ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
%ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
store <4 x i32> %v, <4 x i32>* undef
ret void
}
@ -680,16 +648,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
%node_ptr = load i32, i32* %gep_node_ptr, align 4
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
%ray_extent = load float, float* %gep_ray, align 4
%ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
%ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
%ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
%ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
%ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
%ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
%ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
%ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
%ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
%ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
%ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
store <4 x i32> %v, <4 x i32>* undef
ret void
}
@ -755,16 +723,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
%ray_extent = load float, float* %gep_ray, align 4
%ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
%ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
%ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
%ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
%ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
%ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
%ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
%ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
%ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
store <4 x i32> %v, <4 x i32>* undef
ret void
}
@ -860,16 +828,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
%ray_extent = load float, float* %gep_ray, align 4
%ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
%ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
%ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
%ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
%ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
%ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
%ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
%ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
%ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
%ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
%ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
store <4 x i32> %v, <4 x i32>* undef
ret void
}

View File

@ -3,15 +3,15 @@
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
; RUN: not --crash llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
; Arguments are flattened to represent the actual VGPR_A layout, so we have no
@ -23,43 +23,43 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
%ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
%ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
%ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
%ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
%ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
%ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
; Test: 32-bit node pointer, half-precision (a16) ray direction vectors, with
; every argument marked inreg. The expectations show the inputs arriving in
; SGPRs: the half components are repacked with s_lshr_b32 / s_pack_ll_b32_b16,
; the address operands are copied into v0-v7 with v_mov_b32, the descriptor is
; moved into s[12:15], and one image_bvh_intersect_ray ... a16 is issued.
; NOTE(review): this text carries both sides of the signature change -- the
; <4 x ...> and the <3 x ...> define, check and call lines appear together.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray_a16:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_lshr_b32 s5, s8, 16
; GCN-NEXT: s_pack_ll_b32_b16 s7, s7, s8
; GCN-NEXT: s_pack_ll_b32_b16 s5, s5, s9
; GCN-NEXT: s_mov_b32 s15, s12
; GCN-NEXT: s_mov_b32 s12, s9
; GCN-NEXT: s_lshr_b32 s9, s7, 16
; GCN-NEXT: s_pack_ll_b32_b16 s6, s6, s7
; GCN-NEXT: s_pack_ll_b32_b16 s7, s9, s8
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: v_mov_b32_e32 v6, s7
; GCN-NEXT: v_mov_b32_e32 v7, s5
; GCN-NEXT: s_mov_b32 s15, s13
; GCN-NEXT: s_mov_b32 s14, s12
; GCN-NEXT: s_mov_b32 s13, s11
; GCN-NEXT: s_mov_b32 s12, s10
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s7
; GCN-NEXT: s_mov_b32 s14, s11
; GCN-NEXT: s_mov_b32 s13, s10
; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
; Intrinsic under test; the <4 x i32> result is bitcast to <4 x float> for the
; return value.
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
@ -74,44 +74,44 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec,
; GCN-NEXT: ; return to shader part epilog
main_body:
%node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
%ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
%ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
%ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
%ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
%ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
%ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
%ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
%ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
; Test: 64-bit node pointer, half-precision (a16) ray direction vectors, with
; every argument marked inreg. As the expectations show, the half components
; are repacked in SGPRs (s_lshr_b32 / s_pack_ll_b32_b16), the address operands
; are copied into v0-v8, the descriptor is moved into s[12:15], and one
; image_bvh64_intersect_ray ... a16 is issued over v[0:15].
; NOTE(review): this text carries both sides of the signature change -- the
; <4 x ...> and the <3 x ...> define, check and call lines appear together.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray_a16:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_lshr_b32 s6, s9, 16
; GCN-NEXT: s_pack_ll_b32_b16 s8, s8, s9
; GCN-NEXT: s_pack_ll_b32_b16 s6, s6, s10
; GCN-NEXT: s_mov_b32 s14, s12
; GCN-NEXT: s_mov_b32 s12, s10
; GCN-NEXT: s_lshr_b32 s10, s8, 16
; GCN-NEXT: s_pack_ll_b32_b16 s7, s7, s8
; GCN-NEXT: s_pack_ll_b32_b16 s8, s10, s9
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: v_mov_b32_e32 v6, s7
; GCN-NEXT: v_mov_b32_e32 v7, s8
; GCN-NEXT: v_mov_b32_e32 v8, s6
; GCN-NEXT: s_mov_b32 s15, s14
; GCN-NEXT: s_mov_b32 s14, s13
; GCN-NEXT: s_mov_b32 s13, s12
; GCN-NEXT: s_mov_b32 s12, s11
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s7
; GCN-NEXT: v_mov_b32_e32 v8, s8
; GCN-NEXT: s_mov_b32 s15, s13
; GCN-NEXT: s_mov_b32 s13, s11
; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
; Intrinsic under test; the <4 x i32> result is bitcast to <4 x float> for the
; return value.
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
%r = bitcast <4 x i32> %v to <4 x float>
ret <4 x float> %r
}
@ -178,16 +178,16 @@ main_body:
%node_ptr = load i32, i32* %gep_node_ptr, align 4
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
%ray_extent = load float, float* %gep_ray, align 4
%ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
%ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
%ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
%ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
%ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
%ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
%ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
%ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
%ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
store <4 x i32> %v, <4 x i32>* undef
ret void
}
@ -246,16 +246,16 @@ main_body:
%node_ptr = load i32, i32* %gep_node_ptr, align 4
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
%ray_extent = load float, float* %gep_ray, align 4
%ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
%ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
%ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
%ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
%ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
%ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
%ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
%ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
%ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
%ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
%ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
store <4 x i32> %v, <4 x i32>* undef
ret void
}
@ -316,16 +316,16 @@ main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
%ray_extent = load float, float* %gep_ray, align 4
%ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
%ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
%ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
%ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
%ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
%ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
%ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
%ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
%ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
%ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
%ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
store <4 x i32> %v, <4 x i32>* undef
ret void
}
@ -380,16 +380,16 @@ main_body:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
%ray_extent = load float, float* %gep_ray, align 4
%ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
%ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
%ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
%ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
%ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
%ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
%ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
%ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
%ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
%ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
%ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
%ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
%ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
%ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
%ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
store <4 x i32> %v, <4 x i32>* undef
ret void
}