llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll

723 lines
37 KiB
LLVM
Raw Normal View History

; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -O0 -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -global-isel %s -o - | FileCheck -check-prefix=HSA-VI %s
; i8 kernel argument with no extension attribute; the body zero-extends it to
; i32 and stores it to %out. The checks expect the %out pointer to be loaded
; from offset 0 and the i8 from offset 8 (1-byte load) relative to the base
; pointer copied from $sgpr4_sgpr5, followed by a G_ZEXT to s32.
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
; HSA-VI-LABEL: name: i8_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
%ext = zext i8 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
; i8 kernel argument marked zeroext; the body zero-extends it to i32. The
; expected MIR matches the unattributed i8 case: the argument is still loaded
; as s8 from offset 8 (1-byte load) and widened by an explicit G_ZEXT.
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
; HSA-VI-LABEL: name: i8_zext_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
%ext = zext i8 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
; i8 kernel argument marked signext; the body sign-extends it to i32. The
; checks expect a 1-byte s8 load from offset 8 followed by an explicit G_SEXT
; to s32 before the 4-byte store to %out.
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
; HSA-VI-LABEL: name: i8_sext_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4)
; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
%ext = sext i8 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
; i16 kernel argument with no extension attribute; the body zero-extends it to
; i32. The checks expect a 2-byte s16 load from offset 8 relative to the base
; pointer copied from $sgpr4_sgpr5, then G_ZEXT to s32 and a 4-byte store.
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
; HSA-VI-LABEL: name: i16_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
%ext = zext i16 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
; i16 kernel argument marked zeroext; the body zero-extends it to i32. The
; expected MIR matches the unattributed i16 case: a 2-byte s16 load from
; offset 8 followed by an explicit G_ZEXT to s32.
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
; HSA-VI-LABEL: name: i16_zext_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
%ext = zext i16 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
; i16 kernel argument marked signext; the body sign-extends it to i32. The
; checks expect a 2-byte s16 load from offset 8 followed by an explicit G_SEXT
; to s32 before the 4-byte store to %out.
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
; HSA-VI-LABEL: name: i16_sext_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2, align 8, addrspace 4)
; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16)
; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
%ext = sext i16 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
; i32 kernel argument stored unchanged to %out. The checks expect the pointer
; to be loaded from offset 0 and the s32 value from offset 8 (4-byte load,
; align 8) relative to the base pointer copied from $sgpr4_sgpr5.
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
; HSA-VI-LABEL: name: i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store i32 %in, i32 addrspace(1)* %out, align 4
ret void
}
; float kernel argument stored unchanged to %out. The expected MIR has the
; same shape as the i32 case: the value is an s32 loaded from offset 8
; (4-byte load, align 8) and stored with a 4-byte G_STORE.
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
; HSA-VI-LABEL: name: f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store float %in, float addrspace(1)* %out, align 4
ret void
}
; <2 x i8> kernel argument stored unchanged to %out. The checks expect a
; single <2 x s8> load of 2 bytes from offset 8 (align 8) and a 2-byte store.
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
; HSA-VI-LABEL: name: v2i8_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 2, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store 2 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <2 x i8> %in, <2 x i8> addrspace(1)* %out
ret void
}
; <2 x i16> kernel argument stored unchanged to %out. The checks expect a
; single <2 x s16> load of 4 bytes from offset 8 (align 8) and a 4-byte store.
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
; HSA-VI-LABEL: name: v2i16_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <2 x i16> %in, <2 x i16> addrspace(1)* %out
ret void
}
; <2 x i32> kernel argument stored to %out with an explicit align 4. The
; checks expect a single <2 x s32> load of 8 bytes from offset 8 and an 8-byte
; store whose memory operand carries the IR's align 4.
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
; HSA-VI-LABEL: name: v2i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
ret void
}
; <2 x float> kernel argument stored to %out with an explicit align 4. The
; expected MIR has the same shape as the <2 x i32> case: a <2 x s32> load of
; 8 bytes from offset 8 and an 8-byte store with align 4.
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
; HSA-VI-LABEL: name: v2f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
ret void
}
; <3 x i8> kernel argument stored to %out with align 4. The checks expect the
; odd-sized vector to be loaded whole as <3 x s8> (3 bytes, align 8) from
; offset 8 and stored with a 3-byte G_STORE.
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
; HSA-VI-LABEL: name: v3i8_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 3, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store 3 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
ret void
}
; <3 x i16> kernel argument stored to %out with align 4. The checks expect a
; whole <3 x s16> load (6 bytes, align 8) from offset 8 and a 6-byte store.
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
; HSA-VI-LABEL: name: v3i16_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 6, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store 6 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
ret void
}
; <3 x i32> kernel argument stored to %out with align 4. The checks expect the
; vector to be placed at offset 16 (not 8) and loaded whole as <3 x s32>
; (12 bytes, align 16), then stored with a 12-byte G_STORE.
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
; HSA-VI-LABEL: name: v3i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 12, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
ret void
}
; <3 x float> kernel argument stored to %out with align 4. The expected MIR
; has the same shape as the <3 x i32> case: a <3 x s32> load of 12 bytes
; (align 16) from offset 16 and a 12-byte store.
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
; HSA-VI-LABEL: name: v3f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 12, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
ret void
}
; <4 x i8> kernel argument stored unchanged to %out. The checks expect a
; single <4 x s8> load of 4 bytes from offset 8 (align 8) and a 4-byte store.
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
; HSA-VI-LABEL: name: v4i8_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <4 x i8> %in, <4 x i8> addrspace(1)* %out
ret void
}
; <4 x i16> kernel argument stored unchanged to %out. The checks expect a
; single <4 x s16> load of 8 bytes from offset 8 and an 8-byte store.
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
; HSA-VI-LABEL: name: v4i16_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <4 x i16> %in, <4 x i16> addrspace(1)* %out
ret void
}
; <4 x i32> kernel argument stored to %out with align 4. The checks expect the
; vector at offset 16, loaded whole as <4 x s32> (16 bytes), and a 16-byte
; store whose memory operand carries the IR's align 4.
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
; HSA-VI-LABEL: name: v4i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
ret void
}
; <4 x float> kernel argument stored to %out with align 4. The expected MIR
; has the same shape as the <4 x i32> case: a <4 x s32> load of 16 bytes from
; offset 16 and a 16-byte store with align 4.
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
; HSA-VI-LABEL: name: v4f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
ret void
}
; <8 x i8> kernel argument stored unchanged to %out. The checks expect a
; single <8 x s8> load of 8 bytes from offset 8 and an 8-byte store.
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
; HSA-VI-LABEL: name: v8i8_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <8 x i8> %in, <8 x i8> addrspace(1)* %out
ret void
}
; <8 x i16> kernel argument stored unchanged to %out. The checks expect the
; vector at offset 16, loaded whole as <8 x s16> (16 bytes), and a 16-byte
; store.
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
; HSA-VI-LABEL: name: v8i16_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <8 x i16> %in, <8 x i16> addrspace(1)* %out
ret void
}
; <8 x i32> kernel argument stored to %out with align 4. The checks expect the
; vector at offset 32, loaded whole as <8 x s32> (32 bytes, align 16), and a
; 32-byte store whose memory operand carries the IR's align 4.
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
; HSA-VI-LABEL: name: v8i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 32, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
ret void
}
; <8 x float> kernel argument stored to %out with align 4. The expected MIR
; has the same shape as the <8 x i32> case: an <8 x s32> load of 32 bytes
; (align 16) from offset 32 and a 32-byte store with align 4.
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
; HSA-VI-LABEL: name: v8f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 32, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
ret void
}
; <16 x i8> kernel argument stored unchanged to %out. The checks expect the
; vector at offset 16, loaded whole as <16 x s8> (16 bytes), and a 16-byte
; store.
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
; HSA-VI-LABEL: name: v16i8_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <16 x i8> %in, <16 x i8> addrspace(1)* %out
ret void
}
; <16 x i16> kernel argument stored unchanged to %out. The checks expect the
; vector at offset 32, loaded whole as <16 x s16> (32 bytes, align 16), and a
; 32-byte store.
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
; HSA-VI-LABEL: name: v16i16_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 32, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store 32 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <16 x i16> %in, <16 x i16> addrspace(1)* %out
ret void
}
; <16 x i32> kernel argument stored to %out with align 4. The checks expect
; the vector at offset 64, loaded whole as <16 x s32> (64 bytes, align 16),
; and a 64-byte store whose memory operand carries the IR's align 4.
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
; HSA-VI-LABEL: name: v16i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 64, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
ret void
}
; <16 x float> kernel argument stored to %out with align 4. The expected MIR
; has the same shape as the <16 x i32> case: a <16 x s32> load of 64 bytes
; (align 16) from offset 64 and a 64-byte store with align 4.
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
; HSA-VI-LABEL: name: v16f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 64, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
ret void
}
; i64 kernel argument stored unchanged to %out. The checks expect the pointer
; to be loaded from offset 0 and the s64 value from offset 8 (8-byte load)
; relative to the base pointer copied from $sgpr4_sgpr5, then an 8-byte store.
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
; HSA-VI-LABEL: name: kernel_arg_i64
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
store i64 %a, i64 addrspace(1)* %out, align 8
ret void
}
; double kernel argument stored unchanged to %out. The expected MIR has the
; same shape as the i64 case: the value is an s64 loaded from offset 8
; (8-byte load) and stored with an 8-byte G_STORE.
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
; HSA-VI-LABEL: name: f64_kernel_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
entry:
store double %in, double addrspace(1)* %out
ret void
}
; i1 kernel argument stored unchanged to %out. The checks expect the value to
; be an s1 produced by a 1-byte load from offset 8 (align 8) and written back
; with a 1-byte store.
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
; HSA-VI-LABEL: name: i1_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store 1 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
store i1 %x, i1 addrspace(1)* %out, align 1
ret void
}
; i1 kernel argument zero-extended to i32 in the body. The checks expect an s1
; produced by a 1-byte load from offset 8, a G_ZEXT from s1 to s32, and a
; 4-byte store to %out.
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
; HSA-VI-LABEL: name: i1_arg_zext_i32
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1)
; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
%ext = zext i1 %x to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
; i1 kernel argument zero-extended to i64 in the body. The checks expect an s1
; produced by a 1-byte load from offset 8, a G_ZEXT from s1 to s64, and an
; 8-byte store to %out.
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
; HSA-VI-LABEL: name: i1_arg_zext_i64
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1)
; HSA-VI: G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
%ext = zext i1 %x to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
}
; i1 kernel argument sign-extended to i32 in the body. The checks expect an s1
; produced by a 1-byte load from offset 8, a G_SEXT from s1 to s32, and a
; 4-byte store to %out.
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
; HSA-VI-LABEL: name: i1_arg_sext_i32
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4)
; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1)
; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
%ext = sext i1 %x to i32
; The pointer type needs a space between the element type and the addrspace
; qualifier; "i32addrspace(1)*" is not a valid type token.
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
; i1 kernel argument sign-extended to i64 in the body. The checks expect an s1
; produced by a 1-byte load from offset 8, a G_SEXT from s1 to s64, and an
; 8-byte store to %out.
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
; HSA-VI-LABEL: name: i1_arg_sext_i64
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4)
; HSA-VI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1)
; HSA-VI: G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM 0
%ext = sext i1 %x to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
}
; Kernel whose only argument is an empty struct. The checks list just the COPY
; of $sgpr4_sgpr5 followed by S_ENDPGM 0; no load for %in appears in them.
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
; HSA-VI-LABEL: name: empty_struct_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: S_ENDPGM 0
ret void
}
; The correct load offsets for these:
;   load 4 from 0
;   load 8 from 8
;   load 4 from 24
;   load 8 from 32
; With the SelectionDAG argument lowering, the alignment of the struct
; members is not properly considered, making these wrong.
; FIXME: GlobalISel extractvalue emission is broken.
; Kernel taking {i32, i64}, an unnamed i8, and a second {i32, i64}. The checks
; expect each struct to be loaded whole as an s128 (at offsets 0 and 24 from
; the pointer copied out of $sgpr4_sgpr5), the i8 to be loaded at offset 16,
; and the struct members to be split out with G_EXTRACT at bit offsets 0 and
; 64. The extractvalue/store body is left commented out, presumably because of
; the extractvalue FIXME noted before this function.
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
; %val0 = extractvalue {i32, i64} %arg0, 0
; %val1 = extractvalue {i32, i64} %arg0, 1
; %val2 = extractvalue {i32, i64} %arg1, 0
; %val3 = extractvalue {i32, i64} %arg1, 1
; store volatile i32 %val0, i32 addrspace(1)* null
; store volatile i64 %val1, i64 addrspace(1)* null
; store volatile i32 %val2, i32 addrspace(1)* null
; store volatile i64 %val3, i64 addrspace(1)* null
; HSA-VI-LABEL: name: struct_argument_alignment
; HSA-VI: bb.1 (%ir-block.1):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 16, addrspace 4)
; HSA-VI: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s128), 0
; HSA-VI: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s128), 64
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 16, addrspace 4)
; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
; HSA-VI: [[GEP2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
; HSA-VI: [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP2]](p4) :: (dereferenceable invariant load 16, align 8, addrspace 4)
; HSA-VI: [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s128), 0
; HSA-VI: [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s128), 64
; HSA-VI: S_ENDPGM 0
ret void
}
; No padding between the i8 and the next struct, but round up at the end to a
; 4-byte multiple.
; Packed-struct variant: <{i32, i64}>, an unnamed i8, and a second
; <{i32, i64}>. The checks expect each struct to be loaded whole as an s96 (at
; offsets 0 and 13 from the pointer copied out of $sgpr4_sgpr5, the second one
; with align 1), the i8 to be loaded at offset 12, and the struct members to
; be split out with G_EXTRACT at bit offsets 0 and 32. The extractvalue/store
; body is left commented out.
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
; %val0 = extractvalue <{i32, i64}> %arg0, 0
; %val1 = extractvalue <{i32, i64}> %arg0, 1
; %val2 = extractvalue <{i32, i64}> %arg1, 0
; %val3 = extractvalue <{i32, i64}> %arg1, 1
; store volatile i32 %val0, i32 addrspace(1)* null
; store volatile i64 %val1, i64 addrspace(1)* null
; store volatile i32 %val2, i32 addrspace(1)* null
; store volatile i64 %val3, i64 addrspace(1)* null
; HSA-VI-LABEL: name: packed_struct_argument_alignment
; HSA-VI: bb.1 (%ir-block.1):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 12, align 16, addrspace 4)
; HSA-VI: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s96), 0
; HSA-VI: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s96), 32
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 1, align 4, addrspace 4)
; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 13
; HSA-VI: [[GEP2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
; HSA-VI: [[LOAD2:%[0-9]+]]:_(s96) = G_LOAD [[GEP2]](p4) :: (dereferenceable invariant load 12, align 1, addrspace 4)
; HSA-VI: [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s96), 0
; HSA-VI: [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s96), 32
; HSA-VI: S_ENDPGM 0
ret void
}