[AMDGPU] Change llvm.amdgcn.image.bvh.intersect.ray to take vec3 args

The ray_origin, ray_dir and ray_inv_dir arguments should all be vec3 to match how the hardware instruction works. The API of the corresponding OpenCL builtins is unchanged: they still take vec4 arguments, and clang drops the ignored fourth element when emitting the intrinsic call. Differential Revision: https://reviews.llvm.org/D115032
2021-12-02 12:26:59 +00:00 · 2021-12-02 12:26:59 +00:00 · 2774bad112
parent c8e84c7a5f
commit 2774bad112
7 changed files with 445 additions and 466 deletions
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@ -16592,6 +16592,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
    llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));

+    // The builtins take these arguments as vec4 where the last element is
+    // ignored. The intrinsic takes them as vec3.
+    RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
+                                            ArrayRef<int>{0, 1, 2});
+    RayDir =
+        Builder.CreateShuffleVector(RayDir, RayDir, ArrayRef<int>{0, 1, 2});
+    RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
+                                                ArrayRef<int>{0, 1, 2});
+
    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
                                   {NodePtr->getType(), RayDir->getType()});
    return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
@ -19,7 +19,7 @@ typedef double double4 __attribute__((ext_vector_type(4)));
 typedef half half4 __attribute__((ext_vector_type(4)));
 typedef uint uint4 __attribute__((ext_vector_type(4)));

-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray(global uint4* out, uint node_ptr,
  float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir,
@ -29,7 +29,7 @@ void test_image_bvh_intersect_ray(global uint4* out, uint node_ptr,
           ray_origin, ray_dir, ray_inv_dir, texture_descr);
 }

-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f16
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray_h(global uint4* out, uint node_ptr,
  float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir,
@ -39,7 +39,7 @@ void test_image_bvh_intersect_ray_h(global uint4* out, uint node_ptr,
           ray_origin, ray_dir, ray_inv_dir, texture_descr);
 }

-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f32
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray_l(global uint4* out, ulong node_ptr,
  float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir,
@ -49,7 +49,7 @@ void test_image_bvh_intersect_ray_l(global uint4* out, ulong node_ptr,
           ray_origin, ray_dir, ray_inv_dir, texture_descr);
 }

-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f16
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr,
  float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir,
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@ -1789,9 +1789,11 @@ def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;

 // uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
 //                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
+// <node_ptr> is i32 or i64.
+// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
 def int_amdgcn_image_bvh_intersect_ray :
  Intrinsic<[llvm_v4i32_ty],
-            [llvm_anyint_ty, llvm_float_ty, llvm_v4f32_ty, llvm_anyvector_ty,
+            [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
             LLVMMatchType<1>, llvm_v4i32_ty],
            [IntrReadMem, IntrWillReturn]>;

--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@ -4869,8 +4869,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
  }
  Ops.push_back(RayExtent);

-  auto packLanes = [&Ops, &S32, &B] (Register Src) {
-    auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
+  auto packLanes = [&Ops, &S32, &B](Register Src) {
+    auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
    Ops.push_back(Unmerge.getReg(2));
@ -4878,8 +4878,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,

  packLanes(RayOrigin);
  if (IsA16) {
-    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
-    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
+    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
+    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@ -7503,8 +7503,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,

    assert(NodePtr.getValueType() == MVT::i32 ||
           NodePtr.getValueType() == MVT::i64);
-    assert(RayDir.getValueType() == MVT::v4f16 ||
-           RayDir.getValueType() == MVT::v4f32);
+    assert(RayDir.getValueType() == MVT::v3f16 ||
+           RayDir.getValueType() == MVT::v3f32);

    if (!Subtarget->hasGFX10_AEncoding()) {
      emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@ -3,37 +3,25 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s
 ; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s

-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v3f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v3f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v3f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)

-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
 declare i32 @llvm.amdgcn.workitem.id.x()

-define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
-; GFX1030-LABEL: image_bvh_intersect_ray:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ; return to shader part epilog
-;
-; GFX1013-LABEL: image_bvh_intersect_ray:
-; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    v_mov_b32_e32 v5, v6
-; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
-; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
-; GFX1013-NEXT:    v_mov_b32_e32 v8, v10
-; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
-; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
-; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
-; GFX1013-NEXT:    s_waitcnt vmcnt(0)
-; GFX1013-NEXT:    ; return to shader part epilog
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
+; GCN-LABEL: image_bvh_intersect_ray:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
 }
@ -44,60 +32,48 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float
 ; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
-  %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
-  %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
-  %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
-  %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
-  %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
-  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
-  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
+  %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
+  %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
+  %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
+  %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
+  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
+  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
 }

-define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh_intersect_ray_a16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s4, 0xffff
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GCN-NEXT:    v_and_b32_e32 v10, s4, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_and_b32_e32 v9, s4, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v10, s4, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v8, s4, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_and_or_b32 v5, v6, s4, v5
-; GCN-NEXT:    v_and_or_b32 v6, v7, s4, v10
-; GCN-NEXT:    v_lshl_or_b32 v7, v9, 16, v8
+; GCN-NEXT:    v_lshl_or_b32 v7, v8, 16, v7
+; GCN-NEXT:    v_and_or_b32 v5, v5, s4, v9
+; GCN-NEXT:    v_and_or_b32 v6, v6, s4, v10
 ; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
 }

-define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
-; GFX1030-LABEL: image_bvh64_intersect_ray:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ; return to shader part epilog
-;
-; GFX1013-LABEL: image_bvh64_intersect_ray:
-; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
-; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
-; GFX1013-NEXT:    v_mov_b32_e32 v8, v9
-; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
-; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
-; GFX1013-NEXT:    v_mov_b32_e32 v11, v13
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
-; GFX1013-NEXT:    s_waitcnt vmcnt(0)
-; GFX1013-NEXT:    ; return to shader part epilog
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
+; GCN-LABEL: image_bvh64_intersect_ray:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
 }
@ -109,67 +85,70 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
  %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
-  %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
-  %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
-  %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
-  %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
-  %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
-  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
-  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
+  %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
+  %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
+  %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
+  %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
+  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
+  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
 }

-define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh64_intersect_ray_a16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s4, 0xffff
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v11, s4, v9
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_and_b32_e32 v10, s4, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
+; GCN-NEXT:    v_and_b32_e32 v11, s4, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT:    v_and_b32_e32 v9, s4, v9
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_and_or_b32 v6, v7, s4, v6
-; GCN-NEXT:    v_and_or_b32 v7, v8, s4, v11
-; GCN-NEXT:    v_lshl_or_b32 v8, v10, 16, v9
+; GCN-NEXT:    v_lshl_or_b32 v8, v9, 16, v8
+; GCN-NEXT:    v_and_or_b32 v6, v6, s4, v10
+; GCN-NEXT:    v_and_or_b32 v7, v7, s4, v11
 ; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
 }

-define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) {
 ; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr:
 ; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    v_mov_b32_e32 v18, v0
-; GFX1030-NEXT:    v_mov_b32_e32 v19, v1
-; GFX1030-NEXT:    v_mov_b32_e32 v20, v2
-; GFX1030-NEXT:    v_mov_b32_e32 v21, v3
-; GFX1030-NEXT:    v_mov_b32_e32 v22, v4
-; GFX1030-NEXT:    v_mov_b32_e32 v23, v6
-; GFX1030-NEXT:    v_mov_b32_e32 v24, v7
-; GFX1030-NEXT:    v_mov_b32_e32 v25, v8
-; GFX1030-NEXT:    v_mov_b32_e32 v26, v10
-; GFX1030-NEXT:    v_mov_b32_e32 v27, v11
-; GFX1030-NEXT:    v_mov_b32_e32 v28, v12
+; GFX1030-NEXT:    v_mov_b32_e32 v15, v0
+; GFX1030-NEXT:    v_mov_b32_e32 v16, v1
+; GFX1030-NEXT:    v_mov_b32_e32 v17, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v18, v3
+; GFX1030-NEXT:    v_mov_b32_e32 v19, v4
+; GFX1030-NEXT:    v_mov_b32_e32 v20, v5
+; GFX1030-NEXT:    v_mov_b32_e32 v21, v6
+; GFX1030-NEXT:    v_mov_b32_e32 v22, v7
+; GFX1030-NEXT:    v_mov_b32_e32 v23, v8
+; GFX1030-NEXT:    v_mov_b32_e32 v24, v9
+; GFX1030-NEXT:    v_mov_b32_e32 v25, v10
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1030-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GFX1030-NEXT:    v_readfirstlane_b32 s4, v14
-; GFX1030-NEXT:    v_readfirstlane_b32 s5, v15
-; GFX1030-NEXT:    v_readfirstlane_b32 s6, v16
-; GFX1030-NEXT:    v_readfirstlane_b32 s7, v17
-; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
-; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v11
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v12
+; GFX1030-NEXT:    v_readfirstlane_b32 s6, v13
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v14
+; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
+; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
 ; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[18:33], s[4:7]
-; GFX1030-NEXT:    ; implicit-def: $vgpr14_vgpr15
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[15:30], s[4:7]
+; GFX1030-NEXT:    ; implicit-def: $vgpr11_vgpr12
+; GFX1030-NEXT:    ; implicit-def: $vgpr15
+; GFX1030-NEXT:    ; implicit-def: $vgpr16
+; GFX1030-NEXT:    ; implicit-def: $vgpr17
 ; GFX1030-NEXT:    ; implicit-def: $vgpr18
 ; GFX1030-NEXT:    ; implicit-def: $vgpr19
 ; GFX1030-NEXT:    ; implicit-def: $vgpr20
@ -178,10 +157,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
 ; GFX1030-NEXT:    ; implicit-def: $vgpr23
 ; GFX1030-NEXT:    ; implicit-def: $vgpr24
 ; GFX1030-NEXT:    ; implicit-def: $vgpr25
-; GFX1030-NEXT:    ; implicit-def: $vgpr26
-; GFX1030-NEXT:    ; implicit-def: $vgpr27
-; GFX1030-NEXT:    ; implicit-def: $vgpr28
-; GFX1030-NEXT:    ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17
+; GFX1030-NEXT:    ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
 ; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX1030-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX1030-NEXT:  ; %bb.2:
@ -191,28 +167,24 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
 ; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    v_mov_b32_e32 v5, v6
-; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
-; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
-; GFX1013-NEXT:    v_mov_b32_e32 v8, v10
-; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
-; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
-; GFX1013-NEXT:    v_mov_b32_e32 v18, v14
-; GFX1013-NEXT:    v_mov_b32_e32 v19, v15
+; GFX1013-NEXT:    v_mov_b32_e32 v16, v11
+; GFX1013-NEXT:    v_mov_b32_e32 v17, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v18, v13
+; GFX1013-NEXT:    v_mov_b32_e32 v19, v14
 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1013-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT:    v_readfirstlane_b32 s4, v18
-; GFX1013-NEXT:    v_readfirstlane_b32 s5, v19
-; GFX1013-NEXT:    v_readfirstlane_b32 s6, v16
-; GFX1013-NEXT:    v_readfirstlane_b32 s7, v17
-; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
-; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
+; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
+; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17
+; GFX1013-NEXT:    v_readfirstlane_b32 s6, v18
+; GFX1013-NEXT:    v_readfirstlane_b32 s7, v19
+; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
+; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
 ; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX1013-NEXT:    image_bvh_intersect_ray v[20:23], v[0:15], s[4:7]
+; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17
 ; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX1013-NEXT:    ; implicit-def: $vgpr18_vgpr19
-; GFX1013-NEXT:    ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17
+; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
 ; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX1013-NEXT:    s_cbranch_execnz .LBB6_1
@ -224,15 +196,186 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
 ; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
 ; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
 ; GFX1013-NEXT:    ; return to shader part epilog
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
 }

-define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
+; GFX1030-NEXT:    v_mov_b32_e32 v13, v0
+; GFX1030-NEXT:    v_mov_b32_e32 v14, v1
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v7
+; GFX1030-NEXT:    v_mov_b32_e32 v15, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v16, v3
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_and_b32_e32 v3, s0, v8
+; GFX1030-NEXT:    v_mov_b32_e32 v17, v4
+; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1030-NEXT:    v_and_or_b32 v18, v5, s0, v0
+; GFX1030-NEXT:    v_and_or_b32 v19, v6, s0, v1
+; GFX1030-NEXT:    v_lshl_or_b32 v20, v3, 16, v2
+; GFX1030-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX1030-NEXT:    v_readfirstlane_b32 s6, v11
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v12
+; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
+; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[13:20], s[4:7] a16
+; GFX1030-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1030-NEXT:    ; implicit-def: $vgpr13
+; GFX1030-NEXT:    ; implicit-def: $vgpr14
+; GFX1030-NEXT:    ; implicit-def: $vgpr15
+; GFX1030-NEXT:    ; implicit-def: $vgpr16
+; GFX1030-NEXT:    ; implicit-def: $vgpr17
+; GFX1030-NEXT:    ; implicit-def: $vgpr18
+; GFX1030-NEXT:    ; implicit-def: $vgpr19
+; GFX1030-NEXT:    ; implicit-def: $vgpr20
+; GFX1030-NEXT:    ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
+; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1030-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1030-NEXT:  ; %bb.2:
+; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
+; GFX1013:       ; %bb.0:
+; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX1013-NEXT:    v_and_b32_e32 v14, s0, v7
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX1013-NEXT:    v_and_b32_e32 v8, s0, v8
+; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX1013-NEXT:    v_lshl_or_b32 v7, v8, 16, v7
+; GFX1013-NEXT:    v_and_or_b32 v5, v5, s0, v13
+; GFX1013-NEXT:    v_and_or_b32 v6, v6, s0, v14
+; GFX1013-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1013-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX1013-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX1013-NEXT:    v_readfirstlane_b32 s6, v11
+; GFX1013-NEXT:    v_readfirstlane_b32 s7, v12
+; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
+; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1013-NEXT:    image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16
+; GFX1013-NEXT:    ; implicit-def: $vgpr9_vgpr10
+; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX1013-NEXT:    ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12
+; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1013-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1013-NEXT:  ; %bb.2:
+; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1013-NEXT:    s_waitcnt vmcnt(0)
+; GFX1013-NEXT:    v_mov_b32_e32 v0, v13
+; GFX1013-NEXT:    v_mov_b32_e32 v1, v14
+; GFX1013-NEXT:    v_mov_b32_e32 v2, v15
+; GFX1013-NEXT:    v_mov_b32_e32 v3, v16
+; GFX1013-NEXT:    ; return to shader part epilog
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %r = bitcast <4 x i32> %v to <4 x float>
+  ret <4 x float> %r
+}
+
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) {
+; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    v_mov_b32_e32 v16, v0
+; GFX1030-NEXT:    v_mov_b32_e32 v17, v1
+; GFX1030-NEXT:    v_mov_b32_e32 v18, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v19, v3
+; GFX1030-NEXT:    v_mov_b32_e32 v20, v4
+; GFX1030-NEXT:    v_mov_b32_e32 v21, v5
+; GFX1030-NEXT:    v_mov_b32_e32 v22, v6
+; GFX1030-NEXT:    v_mov_b32_e32 v23, v7
+; GFX1030-NEXT:    v_mov_b32_e32 v24, v8
+; GFX1030-NEXT:    v_mov_b32_e32 v25, v9
+; GFX1030-NEXT:    v_mov_b32_e32 v26, v10
+; GFX1030-NEXT:    v_mov_b32_e32 v27, v11
+; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1030-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v12
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v13
+; GFX1030-NEXT:    v_readfirstlane_b32 s6, v14
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v15
+; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
+; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
+; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[16:31], s[4:7]
+; GFX1030-NEXT:    ; implicit-def: $vgpr12_vgpr13
+; GFX1030-NEXT:    ; implicit-def: $vgpr16
+; GFX1030-NEXT:    ; implicit-def: $vgpr17
+; GFX1030-NEXT:    ; implicit-def: $vgpr18
+; GFX1030-NEXT:    ; implicit-def: $vgpr19
+; GFX1030-NEXT:    ; implicit-def: $vgpr20
+; GFX1030-NEXT:    ; implicit-def: $vgpr21
+; GFX1030-NEXT:    ; implicit-def: $vgpr22
+; GFX1030-NEXT:    ; implicit-def: $vgpr23
+; GFX1030-NEXT:    ; implicit-def: $vgpr24
+; GFX1030-NEXT:    ; implicit-def: $vgpr25
+; GFX1030-NEXT:    ; implicit-def: $vgpr26
+; GFX1030-NEXT:    ; implicit-def: $vgpr27
+; GFX1030-NEXT:    ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15
+; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1030-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1030-NEXT:  ; %bb.2:
+; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
+; GFX1013:       ; %bb.0:
+; GFX1013-NEXT:    v_mov_b32_e32 v16, v12
+; GFX1013-NEXT:    v_mov_b32_e32 v17, v13
+; GFX1013-NEXT:    v_mov_b32_e32 v18, v14
+; GFX1013-NEXT:    v_mov_b32_e32 v19, v15
+; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1013-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
+; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17
+; GFX1013-NEXT:    v_readfirstlane_b32 s6, v18
+; GFX1013-NEXT:    v_readfirstlane_b32 s7, v19
+; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17]
+; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[18:19]
+; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1013-NEXT:    image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7]
+; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17
+; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX1013-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19
+; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1013-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1013-NEXT:  ; %bb.2:
+; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1013-NEXT:    s_waitcnt vmcnt(0)
+; GFX1013-NEXT:    v_mov_b32_e32 v0, v20
+; GFX1013-NEXT:    v_mov_b32_e32 v1, v21
+; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
+; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
+; GFX1013-NEXT:    ; return to shader part epilog
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %r = bitcast <4 x i32> %v to <4 x float>
+  ret <4 x float> %r
+}
+
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
+; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX1030-NEXT:    v_mov_b32_e32 v14, v0
 ; GFX1030-NEXT:    v_mov_b32_e32 v15, v1
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
@ -244,11 +387,12 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1030-NEXT:    v_and_b32_e32 v3, s0, v9
 ; GFX1030-NEXT:    v_mov_b32_e32 v18, v4
+; GFX1030-NEXT:    v_mov_b32_e32 v19, v5
+; GFX1030-NEXT:    v_and_or_b32 v20, v6, s0, v0
+; GFX1030-NEXT:    v_and_or_b32 v21, v7, s0, v1
+; GFX1030-NEXT:    v_lshl_or_b32 v22, v3, 16, v2
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1030-NEXT:    v_and_or_b32 v19, v6, s0, v0
-; GFX1030-NEXT:    v_and_or_b32 v20, v7, s0, v1
-; GFX1030-NEXT:    v_lshl_or_b32 v21, v3, 16, v2
-; GFX1030-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1030-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v10
 ; GFX1030-NEXT:    v_readfirstlane_b32 s5, v11
 ; GFX1030-NEXT:    v_readfirstlane_b32 s6, v12
@ -257,7 +401,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
 ; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[14:21], s[4:7] a16
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[14:29], s[4:7] a16
 ; GFX1030-NEXT:    ; implicit-def: $vgpr10_vgpr11
 ; GFX1030-NEXT:    ; implicit-def: $vgpr14
 ; GFX1030-NEXT:    ; implicit-def: $vgpr15
@ -267,185 +411,9 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX1030-NEXT:    ; implicit-def: $vgpr19
 ; GFX1030-NEXT:    ; implicit-def: $vgpr20
 ; GFX1030-NEXT:    ; implicit-def: $vgpr21
+; GFX1030-NEXT:    ; implicit-def: $vgpr22
 ; GFX1030-NEXT:    ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
 ; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1030-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1030-NEXT:  ; %bb.2:
-; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ; return to shader part epilog
-;
-; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
-; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
-; GFX1013-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GFX1013-NEXT:    v_and_b32_e32 v14, s0, v8
-; GFX1013-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX1013-NEXT:    v_and_b32_e32 v9, s0, v9
-; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1013-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX1013-NEXT:    v_and_or_b32 v5, v6, s0, v5
-; GFX1013-NEXT:    v_and_or_b32 v6, v7, s0, v14
-; GFX1013-NEXT:    v_lshl_or_b32 v7, v9, 16, v8
-; GFX1013-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT:    v_readfirstlane_b32 s4, v10
-; GFX1013-NEXT:    v_readfirstlane_b32 s5, v11
-; GFX1013-NEXT:    v_readfirstlane_b32 s6, v12
-; GFX1013-NEXT:    v_readfirstlane_b32 s7, v13
-; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
-; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
-; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1013-NEXT:    image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16
-; GFX1013-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX1013-NEXT:    ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13
-; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1013-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX1013-NEXT:  ; %bb.2:
-; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
-; GFX1013-NEXT:    s_waitcnt vmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, v14
-; GFX1013-NEXT:    v_mov_b32_e32 v1, v15
-; GFX1013-NEXT:    v_mov_b32_e32 v2, v16
-; GFX1013-NEXT:    v_mov_b32_e32 v3, v17
-; GFX1013-NEXT:    ; return to shader part epilog
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
-  %r = bitcast <4 x i32> %v to <4 x float>
-  ret <4 x float> %r
-}
-
-define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
-; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    v_mov_b32_e32 v19, v0
-; GFX1030-NEXT:    v_mov_b32_e32 v20, v1
-; GFX1030-NEXT:    v_mov_b32_e32 v21, v2
-; GFX1030-NEXT:    v_mov_b32_e32 v22, v3
-; GFX1030-NEXT:    v_mov_b32_e32 v23, v4
-; GFX1030-NEXT:    v_mov_b32_e32 v24, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v25, v7
-; GFX1030-NEXT:    v_mov_b32_e32 v26, v8
-; GFX1030-NEXT:    v_mov_b32_e32 v27, v9
-; GFX1030-NEXT:    v_mov_b32_e32 v28, v11
-; GFX1030-NEXT:    v_mov_b32_e32 v29, v12
-; GFX1030-NEXT:    v_mov_b32_e32 v30, v13
-; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1030-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX1030-NEXT:    v_readfirstlane_b32 s4, v15
-; GFX1030-NEXT:    v_readfirstlane_b32 s5, v16
-; GFX1030-NEXT:    v_readfirstlane_b32 s6, v17
-; GFX1030-NEXT:    v_readfirstlane_b32 s7, v18
-; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
-; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
-; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[19:34], s[4:7]
-; GFX1030-NEXT:    ; implicit-def: $vgpr15_vgpr16
-; GFX1030-NEXT:    ; implicit-def: $vgpr19
-; GFX1030-NEXT:    ; implicit-def: $vgpr20
-; GFX1030-NEXT:    ; implicit-def: $vgpr21
-; GFX1030-NEXT:    ; implicit-def: $vgpr22
-; GFX1030-NEXT:    ; implicit-def: $vgpr23
-; GFX1030-NEXT:    ; implicit-def: $vgpr24
-; GFX1030-NEXT:    ; implicit-def: $vgpr25
-; GFX1030-NEXT:    ; implicit-def: $vgpr26
-; GFX1030-NEXT:    ; implicit-def: $vgpr27
-; GFX1030-NEXT:    ; implicit-def: $vgpr28
-; GFX1030-NEXT:    ; implicit-def: $vgpr29
-; GFX1030-NEXT:    ; implicit-def: $vgpr30
-; GFX1030-NEXT:    ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18
-; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1030-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1030-NEXT:  ; %bb.2:
-; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ; return to shader part epilog
-;
-; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
-; GFX1013:       ; %bb.0:
-; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
-; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
-; GFX1013-NEXT:    v_mov_b32_e32 v8, v9
-; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
-; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
-; GFX1013-NEXT:    v_mov_b32_e32 v11, v13
-; GFX1013-NEXT:    v_mov_b32_e32 v19, v15
-; GFX1013-NEXT:    v_mov_b32_e32 v20, v16
-; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1013-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX1013-NEXT:    v_readfirstlane_b32 s4, v19
-; GFX1013-NEXT:    v_readfirstlane_b32 s5, v20
-; GFX1013-NEXT:    v_readfirstlane_b32 s6, v17
-; GFX1013-NEXT:    v_readfirstlane_b32 s7, v18
-; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20]
-; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
-; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1013-NEXT:    image_bvh64_intersect_ray v[21:24], v[0:15], s[4:7]
-; GFX1013-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX1013-NEXT:    ; implicit-def: $vgpr19_vgpr20
-; GFX1013-NEXT:    ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18
-; GFX1013-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1013-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX1013-NEXT:  ; %bb.2:
-; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
-; GFX1013-NEXT:    s_waitcnt vmcnt(0)
-; GFX1013-NEXT:    v_mov_b32_e32 v0, v21
-; GFX1013-NEXT:    v_mov_b32_e32 v1, v22
-; GFX1013-NEXT:    v_mov_b32_e32 v2, v23
-; GFX1013-NEXT:    v_mov_b32_e32 v3, v24
-; GFX1013-NEXT:    ; return to shader part epilog
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
-  %r = bitcast <4 x i32> %v to <4 x float>
-  ret <4 x float> %r
-}
-
-define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
-; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
-; GFX1030-NEXT:    v_mov_b32_e32 v15, v0
-; GFX1030-NEXT:    v_mov_b32_e32 v16, v1
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v9
-; GFX1030-NEXT:    v_mov_b32_e32 v17, v2
-; GFX1030-NEXT:    v_mov_b32_e32 v18, v3
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX1030-NEXT:    v_and_b32_e32 v3, s0, v10
-; GFX1030-NEXT:    v_mov_b32_e32 v19, v4
-; GFX1030-NEXT:    v_mov_b32_e32 v20, v5
-; GFX1030-NEXT:    v_and_or_b32 v21, v7, s0, v0
-; GFX1030-NEXT:    v_and_or_b32 v22, v8, s0, v1
-; GFX1030-NEXT:    v_lshl_or_b32 v23, v3, 16, v2
-; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1030-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX1030-NEXT:    v_readfirstlane_b32 s4, v11
-; GFX1030-NEXT:    v_readfirstlane_b32 s5, v12
-; GFX1030-NEXT:    v_readfirstlane_b32 s6, v13
-; GFX1030-NEXT:    v_readfirstlane_b32 s7, v14
-; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
-; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
-; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[15:30], s[4:7] a16
-; GFX1030-NEXT:    ; implicit-def: $vgpr11_vgpr12
-; GFX1030-NEXT:    ; implicit-def: $vgpr15
-; GFX1030-NEXT:    ; implicit-def: $vgpr16
-; GFX1030-NEXT:    ; implicit-def: $vgpr17
-; GFX1030-NEXT:    ; implicit-def: $vgpr18
-; GFX1030-NEXT:    ; implicit-def: $vgpr19
-; GFX1030-NEXT:    ; implicit-def: $vgpr20
-; GFX1030-NEXT:    ; implicit-def: $vgpr21
-; GFX1030-NEXT:    ; implicit-def: $vgpr22
-; GFX1030-NEXT:    ; implicit-def: $vgpr23
-; GFX1030-NEXT:    ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14
-; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
 ; GFX1030-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX1030-NEXT:  ; %bb.2:
 ; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
@ -455,20 +423,20 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 ; GFX1013:       ; %bb.0:
 ; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
-; GFX1013-NEXT:    v_mov_b32_e32 v16, v11
-; GFX1013-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; GFX1013-NEXT:    v_and_b32_e32 v11, s0, v9
-; GFX1013-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX1013-NEXT:    v_and_b32_e32 v10, s0, v10
-; GFX1013-NEXT:    v_mov_b32_e32 v17, v12
-; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX1013-NEXT:    v_mov_b32_e32 v16, v10
+; GFX1013-NEXT:    v_mov_b32_e32 v17, v11
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
+; GFX1013-NEXT:    v_and_b32_e32 v11, s0, v8
+; GFX1013-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX1013-NEXT:    v_and_b32_e32 v9, s0, v9
+; GFX1013-NEXT:    v_mov_b32_e32 v18, v12
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX1013-NEXT:    v_mov_b32_e32 v18, v13
-; GFX1013-NEXT:    v_mov_b32_e32 v19, v14
+; GFX1013-NEXT:    v_mov_b32_e32 v19, v13
+; GFX1013-NEXT:    v_lshl_or_b32 v8, v9, 16, v8
 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1013-NEXT:    v_and_or_b32 v6, v7, s0, v6
-; GFX1013-NEXT:    v_and_or_b32 v7, v8, s0, v11
-; GFX1013-NEXT:    v_lshl_or_b32 v8, v10, 16, v9
+; GFX1013-NEXT:    v_and_or_b32 v6, v6, s0, v10
+; GFX1013-NEXT:    v_and_or_b32 v7, v7, s0, v11
 ; GFX1013-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v16
 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v17
@ -493,7 +461,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX1013-NEXT:    v_mov_b32_e32 v2, v22
 ; GFX1013-NEXT:    v_mov_b32_e32 v3, v23
 ; GFX1013-NEXT:    ; return to shader part epilog
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
 }
@ -567,16 +535,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
-  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
-  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
-  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
-  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
-  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
-  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
-  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
+  %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
+  %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
+  %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
+  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
+  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
 }
@ -680,16 +648,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
-  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
-  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
-  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
-  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
-  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
-  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
-  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
+  %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
+  %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
+  %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
+  %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
+  %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
 }
@ -755,16 +723,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
-  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
-  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
-  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
-  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
-  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
-  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
-  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
+  %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
+  %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
+  %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
+  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
+  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
 }
@ -860,16 +828,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
-  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
-  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
-  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
-  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
-  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
-  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
-  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
+  %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
+  %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
+  %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
+  %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
+  %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
 }
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@ -3,15 +3,15 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
 ; RUN: not --crash llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s

-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v3f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v3f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v3f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)

-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)

 ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
 ; Arguments are flattened to represent the actual VGPR_A layout, so we have no
@ -23,43 +23,43 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 main_body:
-  %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
-  %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
-  %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
-  %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
-  %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
-  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
-  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
+  %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
+  %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
+  %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
+  %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
+  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
+  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
 }

-define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh_intersect_ray_a16:
 ; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_lshr_b32 s5, s8, 16
-; GCN-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GCN-NEXT:    s_pack_ll_b32_b16 s5, s5, s9
+; GCN-NEXT:    s_mov_b32 s15, s12
+; GCN-NEXT:    s_mov_b32 s12, s9
+; GCN-NEXT:    s_lshr_b32 s9, s7, 16
+; GCN-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
+; GCN-NEXT:    s_pack_ll_b32_b16 s7, s9, s8
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-NEXT:    v_mov_b32_e32 v5, s6
-; GCN-NEXT:    v_mov_b32_e32 v6, s7
-; GCN-NEXT:    v_mov_b32_e32 v7, s5
-; GCN-NEXT:    s_mov_b32 s15, s13
-; GCN-NEXT:    s_mov_b32 s14, s12
-; GCN-NEXT:    s_mov_b32 s13, s11
-; GCN-NEXT:    s_mov_b32 s12, s10
+; GCN-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-NEXT:    v_mov_b32_e32 v6, s6
+; GCN-NEXT:    v_mov_b32_e32 v7, s7
+; GCN-NEXT:    s_mov_b32 s14, s11
+; GCN-NEXT:    s_mov_b32 s13, s10
 ; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 main_body:
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
 }
@ -74,44 +74,44 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec,
 ; GCN-NEXT:    ; return to shader part epilog
 main_body:
  %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
-  %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
-  %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
-  %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
-  %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
-  %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
-  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
-  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
+  %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
+  %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
+  %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
+  %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
+  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
+  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
 }

-define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh64_intersect_ray_a16:
 ; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_lshr_b32 s6, s9, 16
-; GCN-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GCN-NEXT:    s_pack_ll_b32_b16 s6, s6, s10
+; GCN-NEXT:    s_mov_b32 s14, s12
+; GCN-NEXT:    s_mov_b32 s12, s10
+; GCN-NEXT:    s_lshr_b32 s10, s8, 16
+; GCN-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GCN-NEXT:    s_pack_ll_b32_b16 s8, s10, s9
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
 ; GCN-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-NEXT:    v_mov_b32_e32 v6, s7
-; GCN-NEXT:    v_mov_b32_e32 v7, s8
-; GCN-NEXT:    v_mov_b32_e32 v8, s6
-; GCN-NEXT:    s_mov_b32 s15, s14
-; GCN-NEXT:    s_mov_b32 s14, s13
-; GCN-NEXT:    s_mov_b32 s13, s12
-; GCN-NEXT:    s_mov_b32 s12, s11
+; GCN-NEXT:    v_mov_b32_e32 v6, s6
+; GCN-NEXT:    v_mov_b32_e32 v7, s7
+; GCN-NEXT:    v_mov_b32_e32 v8, s8
+; GCN-NEXT:    s_mov_b32 s15, s13
+; GCN-NEXT:    s_mov_b32 s13, s11
 ; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 main_body:
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
 }
@ -178,16 +178,16 @@ main_body:
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
-  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
-  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
-  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
-  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
-  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
-  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
-  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
+  %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
+  %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
+  %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
+  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
+  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
 }
@ -246,16 +246,16 @@ main_body:
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
-  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
-  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
-  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
-  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
-  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
-  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
-  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
+  %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
+  %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
+  %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
+  %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
+  %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
 }
@ -316,16 +316,16 @@ main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
-  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
-  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
-  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
-  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
-  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
-  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
-  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
+  %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
+  %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
+  %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
+  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
+  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
 }
@ -380,16 +380,16 @@ main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
-  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
-  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
-  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
-  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
-  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
-  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
-  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
-  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
-  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
-  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+  %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
+  %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
+  %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
+  %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
+  %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
+  %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
+  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
 }