[NVPTX] Unify vectorization of load/stores of aggregate arguments and return values.

Original code only used vector loads/stores for explicit vector arguments.
It could also do more loads/stores than necessary (e.g. v5f32 would
touch 8 f32 values). Aggregate types were loaded one element at a time,
including any vectors contained within them.

This change attempts to generalize (and simplify) parameter space
loads/stores so that vector loads/stores can be used more broadly.
Functionality of the patch has been verified by compiling the thrust
test suite and manually checking the differences between the PTX
generated by llvm with and without the patch.

General algorithm:
* ComputePTXValueVTs() flattens an input/output argument into a flat list
  of scalars to load/store and returns their types and offsets.
* VectorizePTXValueVTs() uses that data to create a vectorization plan:
  an array of flags marking the boundaries of vectorized
  loads/stores. Scalars are represented as 1-element vectors.
* Code that generates loads/stores implements a simple state machine
  that constructs a vector according to the plan.

Differential Revision: https://reviews.llvm.org/D30011

llvm-svn: 295784
This commit is contained in:
Artem Belevich 2017-02-21 22:56:05 +00:00
parent 7d6b71db4f
commit 29bbdc1c32
9 changed files with 1393 additions and 755 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,21 +1,40 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
declare <2 x float> @barv(<2 x float> %input)
declare <3 x float> @barv3(<3 x float> %input)
declare [2 x float] @bara([2 x float] %input)
declare {float, float} @bars({float, float} %input)
define void @foov(<2 x float> %input, <2 x float>* %output) {
; CHECK-LABEL: @foov
define void @test_v2f32(<2 x float> %input, <2 x float>* %output) {
; CHECK-LABEL: @test_v2f32
%call = tail call <2 x float> @barv(<2 x float> %input)
; CHECK: .param .align 8 .b8 retval0[8];
; CHECK: ld.param.v2.f32 {[[ELEMV1:%f[0-9]+]], [[ELEMV2:%f[0-9]+]]}, [retval0+0];
; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0];
store <2 x float> %call, <2 x float>* %output, align 8
; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEMV1]], [[ELEMV2]]}
; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
ret void
}
define void @fooa([2 x float] %input, [2 x float]* %output) {
; CHECK-LABEL: @fooa
define void @test_v3f32(<3 x float> %input, <3 x float>* %output) {
; CHECK-LABEL: @test_v3f32
;
%call = tail call <3 x float> @barv3(<3 x float> %input)
; CHECK: .param .align 16 .b8 retval0[16];
; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0];
; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8];
; Make sure we don't load more values than we need to.
; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12];
store <3 x float> %call, <3 x float>* %output, align 8
; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8],
; -- This is suboptimal. We should do st.v2.f32 instead
; of combining 2xf32 into i64.
; CHECK-DAG: st.u64 [{{%rd[0-9]}}],
; CHECK: ret;
ret void
}
define void @test_a2f32([2 x float] %input, [2 x float]* %output) {
; CHECK-LABEL: @test_a2f32
%call = tail call [2 x float] @bara([2 x float] %input)
; CHECK: .param .align 4 .b8 retval0[8];
; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0+0];
@ -28,8 +47,8 @@ define void @fooa([2 x float] %input, [2 x float]* %output) {
; CHECK: ret
}
define void @foos({float, float} %input, {float, float}* %output) {
; CHECK-LABEL: @foos
define void @test_s2f32({float, float} %input, {float, float}* %output) {
; CHECK-LABEL: @test_s2f32
%call = tail call {float, float} @bars({float, float} %input)
; CHECK: .param .align 4 .b8 retval0[8];
; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0+0];

View File

@ -229,7 +229,7 @@ define half @test_tailcall_flipped(half %a, half %b) #0 {
; CHECK-LABEL: test_select(
; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0];
; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1];
; CHECK: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;

View File

@ -2,8 +2,11 @@
declare <4 x float> @bar()
; CHECK-LABEL: .func foo(
define void @foo(<4 x float>* %ptr) {
; CHECK: ld.param.v4.f32
; CHECK: ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0];
; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0];
; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}
%val = tail call <4 x float> @bar()
store <4 x float> %val, <4 x float>* %ptr
ret void

View File

@ -1,4 +1,4 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
@ -27,9 +27,9 @@ entry:
; PTX: LBB[[LABEL:[_0-9]+]]:
; PTX: ld.u8 %rs[[REG:[0-9]+]]
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
}
define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@ -45,9 +45,9 @@ entry:
; PTX: LBB[[LABEL:[_0-9]+]]:
; PTX: ld.volatile.u8 %rs[[REG:[0-9]+]]
; PTX: st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]]
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
}
define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@ -78,12 +78,13 @@ entry:
; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]]
; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller(
; PTX: ld.param.u8 %rs[[REG:[0-9]+]]
; PTX: ld.param.u32 %r[[C:[0-9]+]]
; PTX: cvt.u16.u32 %rs[[REG:[0-9]+]], %r[[C]];
; PTX: LBB[[LABEL:[_0-9]+]]:
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
}
define i8* @volatile_memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
@ -118,7 +119,7 @@ entry:
; PTX-NEXT: @%p[[SRC_GT_THAN_DST]] bra LBB[[FORWARD_BB:[0-9_]+]]
; -- this is the backwards copying BB
; PTX: @%p[[NEQ0]] bra LBB[[EXIT:[0-9_]+]]
; PTX: add.s64 %rd[[N]], %rd[[N]], -1
; PTX: add.s64 %rd{{[0-9]}}, %rd{{[0-9]}}, -1
; PTX: ld.u8 %rs[[ELEMENT:[0-9]+]]
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]]
; -- this is the forwards copying BB
@ -126,7 +127,7 @@ entry:
; PTX: @%p[[NEQ0]] bra LBB[[EXIT]]
; PTX: ld.u8 %rs[[ELEMENT2:[0-9]+]]
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]]
; PTX: add.s64 %rd[[INDEX:[0-9]+]], %rd[[INDEX]], 1
; PTX: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1
; -- exit block
; PTX: LBB[[EXIT]]:
; PTX-NEXT: st.param.b64 [func_retval0

View File

@ -0,0 +1,813 @@
; Verifies correctness of load/store of parameters and return values.
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s
%s_i1 = type { i1 }
%s_i8 = type { i8 }
%s_i16 = type { i16 }
%s_half = type { half }
%s_i32 = type { i32 }
%s_float = type { float }
%s_i64 = type { i64 }
%s_f64 = type { double }
; More complicated types. i64 is used to increase natural alignment
; requirement for the type.
%s_i32x4 = type { i32, i32, i32, i32, i64}
%s_i32f32 = type { i32, float, i32, float, i64}
%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64}
%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}>
%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]}
; All scalar parameters must be at least 32 bits in size.
; i1 is loaded/stored as i8.
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_i1(
; CHECK-NEXT: .param .b32 test_i1_param_0
; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0];
; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1;
; CHECK: .param .b32 param0;
; CHECK: st.param.b32 [param0+0], [[A]]
; CHECK: .param .b32 retval0;
; CHECK: call.uni
; CHECK-NEXT: test_i1,
; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1;
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK: ret;
define i1 @test_i1(i1 %a) {
%r = tail call i1 @test_i1(i1 %a);
ret i1 %r;
}
; Signed i1 is a somewhat special case. We only care about one bit and
; then use neg.s32 to convert it to 32-bit -1 if it's set.
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_i1s(
; CHECK-NEXT: .param .b32 test_i1s_param_0
; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1;
; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]];
; CHECK: .param .b32 param0;
; CHECK: st.param.b32 [param0+0], [[A]];
; CHECK: .param .b32 retval0;
; CHECK: call.uni
; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1;
; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]];
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define signext i1 @test_i1s(i1 signext %a) {
%r = tail call signext i1 @test_i1s(i1 signext %a);
ret i1 %r;
}
; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment.
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
; CHECK-LABEL: test_v3i1(
; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4]
; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0]
; CHECK: .param .align 4 .b8 param0[4];
; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
; CHECK-DAG: st.param.b8 [param0+2], [[E2]];
; CHECK: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v3i1,
; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}
; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
; CHECK-NEXT: ret;
define <3 x i1> @test_v3i1(<3 x i1> %a) {
%r = tail call <3 x i1> @test_v3i1(<3 x i1> %a);
ret <3 x i1> %r;
}
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
; CHECK-LABEL: test_v4i1(
; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4]
; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0]
; CHECK: .param .align 4 .b8 param0[4];
; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK: test_v4i1,
; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]};
; CHECK-NEXT: ret;
define <4 x i1> @test_v4i1(<4 x i1> %a) {
%r = tail call <4 x i1> @test_v4i1(<4 x i1> %a);
ret <4 x i1> %r;
}
; CHECK: .func (.param .align 8 .b8 func_retval0[8])
; CHECK-LABEL: test_v5i1(
; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8]
; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0]
; CHECK: .param .align 8 .b8 param0[8];
; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
; CHECK: .param .align 8 .b8 retval0[8];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v5i1,
; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
; CHECK-NEXT: ret;
define <5 x i1> @test_v5i1(<5 x i1> %a) {
%r = tail call <5 x i1> @test_v5i1(<5 x i1> %a);
ret <5 x i1> %r;
}
; Unsigned i8 is loaded into a 16-bit register, zero-extended to 32 bits, and masked to 8 bits.
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_i8(
; CHECK-NEXT: .param .b32 test_i8_param_0
; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0];
; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255;
; CHECK: .param .b32 param0;
; CHECK: st.param.b32 [param0+0], [[A]];
; CHECK: .param .b32 retval0;
; CHECK: call.uni (retval0),
; CHECK: test_i8,
; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255;
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define i8 @test_i8(i8 %a) {
%r = tail call i8 @test_i8(i8 %a);
ret i8 %r;
}
; signed i8 is loaded into 16-bit register which is then sign-extended to i32.
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_i8s(
; CHECK-NEXT: .param .b32 test_i8s_param_0
; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
; CHECK: .param .b32 param0;
; CHECK: st.param.b32 [param0+0], [[A]];
; CHECK: .param .b32 retval0;
; CHECK: call.uni (retval0),
; CHECK: test_i8s,
; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]];
; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]];
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define signext i8 @test_i8s(i8 signext %a) {
%r = tail call signext i8 @test_i8s(i8 signext %a);
ret i8 %r;
}
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
; CHECK-LABEL: test_v3i8(
; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
; CHECK: .param .align 4 .b8 param0[4];
; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
; CHECK: st.param.b8 [param0+2], [[E2]];
; CHECK: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v3i8,
; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]};
; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
; CHECK-NEXT: ret;
define <3 x i8> @test_v3i8(<3 x i8> %a) {
%r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
ret <3 x i8> %r;
}
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
; CHECK-LABEL: test_v4i8(
; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
; CHECK: .param .align 4 .b8 param0[4];
; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v4i8,
; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
; CHECK-NEXT: ret;
define <4 x i8> @test_v4i8(<4 x i8> %a) {
%r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
ret <4 x i8> %r;
}
; <5 x i8>: the first four elements are moved with one v4 access and the
; fifth element with a scalar access at byte offset 4. The same split is
; expected for the incoming parameter, the outgoing call argument, the
; call's return value and this function's own return value.
; CHECK: .func (.param .align 8 .b8 func_retval0[8])
; CHECK-LABEL: test_v5i8(
; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
; CHECK: .param .align 8 .b8 param0[8];
; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
; CHECK: .param .align 8 .b8 retval0[8];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v5i8,
; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
; CHECK-NEXT: ret;
define <5 x i8> @test_v5i8(<5 x i8> %a) {
%r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
ret <5 x i8> %r;
}
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_i16(
; CHECK-NEXT: .param .b32 test_i16_param_0
; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0];
; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
; CHECK: .param .b32 param0;
; CHECK: st.param.b32 [param0+0], [[E32]];
; CHECK: .param .b32 retval0;
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_i16,
; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535;
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define i16 @test_i16(i16 %a) {
%r = tail call i16 @test_i16(i16 %a);
ret i16 %r;
}
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_i16s(
; CHECK-NEXT: .param .b32 test_i16s_param_0
; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
; CHECK: .param .b32 param0;
; CHECK: st.param.b32 [param0+0], [[E32]];
; CHECK: .param .b32 retval0;
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_i16s,
; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define signext i16 @test_i16s(i16 signext %a) {
%r = tail call signext i16 @test_i16s(i16 signext %a);
ret i16 %r;
}
; CHECK: .func (.param .align 8 .b8 func_retval0[8])
; CHECK-LABEL: test_v3i16(
; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
; CHECK: .param .align 8 .b8 param0[8];
; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
; CHECK: st.param.b16 [param0+4], [[E2]];
; CHECK: .param .align 8 .b8 retval0[8];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v3i16,
; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
; CHECK-NEXT: ret;
define <3 x i16> @test_v3i16(<3 x i16> %a) {
%r = tail call <3 x i16> @test_v3i16(<3 x i16> %a);
ret <3 x i16> %r;
}
; CHECK: .func (.param .align 8 .b8 func_retval0[8])
; CHECK-LABEL: test_v4i16(
; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
; CHECK: .param .align 8 .b8 param0[8];
; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK: .param .align 8 .b8 retval0[8];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v4i16,
; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
; CHECK-NEXT: ret;
define <4 x i16> @test_v4i16(<4 x i16> %a) {
%r = tail call <4 x i16> @test_v4i16(<4 x i16> %a);
ret <4 x i16> %r;
}
; <5 x i16>: the first four elements are moved with one v4 access and the
; fifth element with a scalar access at byte offset 8. The same split is
; expected for the incoming parameter, the outgoing call argument, the
; call's return value and this function's own return value.
; CHECK: .func (.param .align 16 .b8 func_retval0[16])
; CHECK-LABEL: test_v5i16(
; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
; CHECK: .param .align 16 .b8 param0[16];
; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
; CHECK: .param .align 16 .b8 retval0[16];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v5i16,
; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]];
; CHECK-NEXT: ret;
define <5 x i16> @test_v5i16(<5 x i16> %a) {
%r = tail call <5 x i16> @test_v5i16(<5 x i16> %a);
ret <5 x i16> %r;
}
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_half(
; CHECK-NEXT: .param .b32 test_half_param_0
; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_half_param_0];
; CHECK: .param .b32 param0;
; CHECK: st.param.b16 [param0+0], [[E]];
; CHECK: .param .b32 retval0;
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_half,
; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
; CHECK: st.param.b16 [func_retval0+0], [[R]]
; CHECK-NEXT: ret;
define half @test_half(half %a) {
%r = tail call half @test_half(half %a);
ret half %r;
}
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_i32(
; CHECK-NEXT: .param .b32 test_i32_param_0
; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0];
; CHECK: .param .b32 param0;
; CHECK: st.param.b32 [param0+0], [[E]];
; CHECK: .param .b32 retval0;
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_i32,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define i32 @test_i32(i32 %a) {
%r = tail call i32 @test_i32(i32 %a);
ret i32 %r;
}
; CHECK: .func (.param .align 16 .b8 func_retval0[16])
; CHECK-LABEL: test_v3i32(
; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
; CHECK: .param .align 16 .b8 param0[16];
; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
; CHECK: st.param.b32 [param0+8], [[E2]];
; CHECK: .param .align 16 .b8 retval0[16];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v3i32,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
; CHECK-NEXT: ret;
define <3 x i32> @test_v3i32(<3 x i32> %a) {
%r = tail call <3 x i32> @test_v3i32(<3 x i32> %a);
ret <3 x i32> %r;
}
; <4 x i32> is moved with a single v4 access in every direction: incoming
; parameter, outgoing call argument, call return value and this function's
; own return value. The return store must be immediately followed by ret.
; CHECK: .func (.param .align 16 .b8 func_retval0[16])
; CHECK-LABEL: test_v4i32(
; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
; CHECK: .param .align 16 .b8 param0[16];
; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK: .param .align 16 .b8 retval0[16];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v4i32,
; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
; CHECK-NEXT: ret;
define <4 x i32> @test_v4i32(<4 x i32> %a) {
%r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
ret <4 x i32> %r;
}
; <5 x i32>: the first four elements are moved with one v4 access and the
; fifth element with a scalar access at byte offset 16. The same split is
; expected for the incoming parameter, the outgoing call argument, the
; call's return value and this function's own return value.
; CHECK: .func (.param .align 32 .b8 func_retval0[32])
; CHECK-LABEL: test_v5i32(
; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
; CHECK: .param .align 32 .b8 param0[32];
; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
; CHECK: .param .align 32 .b8 retval0[32];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v5i32,
; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]];
; CHECK-NEXT: ret;
define <5 x i32> @test_v5i32(<5 x i32> %a) {
%r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
ret <5 x i32> %r;
}
; CHECK: .func (.param .b32 func_retval0)
; CHECK-LABEL: test_float(
; CHECK-NEXT: .param .b32 test_float_param_0
; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_float_param_0];
; CHECK: .param .b32 param0;
; CHECK: st.param.f32 [param0+0], [[E]];
; CHECK: .param .b32 retval0;
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_float,
; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
; CHECK: st.param.f32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define float @test_float(float %a) {
%r = tail call float @test_float(float %a);
ret float %r;
}
; CHECK: .func (.param .b64 func_retval0)
; CHECK-LABEL: test_i64(
; CHECK-NEXT: .param .b64 test_i64_param_0
; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0];
; CHECK: .param .b64 param0;
; CHECK: st.param.b64 [param0+0], [[E]];
; CHECK: .param .b64 retval0;
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_i64,
; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
; CHECK: st.param.b64 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define i64 @test_i64(i64 %a) {
%r = tail call i64 @test_i64(i64 %a);
ret i64 %r;
}
; <3 x i64>: elements 0 and 1 are moved with one v2 access and element 2
; with a scalar access at byte offset 16. The same split is expected for the
; incoming parameter, the outgoing call argument, the call's return value
; and this function's own return value. Each return-value store is checked
; exactly once, because only one store per offset is expected.
; CHECK: .func (.param .align 32 .b8 func_retval0[32])
; CHECK-LABEL: test_v3i64(
; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
; CHECK: .param .align 32 .b8 param0[32];
; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
; CHECK: st.param.b64 [param0+16], [[E2]];
; CHECK: .param .align 32 .b8 retval0[32];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v3i64,
; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16];
; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]];
; CHECK-NEXT: ret;
define <3 x i64> @test_v3i64(<3 x i64> %a) {
%r = tail call <3 x i64> @test_v3i64(<3 x i64> %a);
ret <3 x i64> %r;
}
; For i64 vector loads are limited by PTX to 2 elements.
; CHECK: .func (.param .align 32 .b8 func_retval0[32])
; CHECK-LABEL: test_v4i64(
; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32]
; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
; CHECK: .param .align 32 .b8 param0[32];
; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
; CHECK: .param .align 32 .b8 retval0[32];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_v4i64,
; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]};
; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
; CHECK-NEXT: ret;
define <4 x i64> @test_v4i64(<4 x i64> %a) {
%r = tail call <4 x i64> @test_v4i64(<4 x i64> %a);
ret <4 x i64> %r;
}
; Aggregates, on the other hand, do not get extended.
; CHECK: .func (.param .align 1 .b8 func_retval0[1])
; CHECK-LABEL: test_s_i1(
; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1]
; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
; CHECK: .param .align 1 .b8 param0[1];
; CHECK: st.param.b8 [param0+0], [[A]]
; CHECK: .param .align 1 .b8 retval0[1];
; CHECK: call.uni
; CHECK-NEXT: test_s_i1,
; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
; CHECK: st.param.b8 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define %s_i1 @test_s_i1(%s_i1 %a) {
%r = tail call %s_i1 @test_s_i1(%s_i1 %a);
ret %s_i1 %r;
}
; CHECK: .func (.param .align 1 .b8 func_retval0[1])
; CHECK-LABEL: test_s_i8(
; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
; CHECK: .param .align 1 .b8 param0[1];
; CHECK: st.param.b8 [param0+0], [[A]]
; CHECK: .param .align 1 .b8 retval0[1];
; CHECK: call.uni
; CHECK-NEXT: test_s_i8,
; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
; CHECK: st.param.b8 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define %s_i8 @test_s_i8(%s_i8 %a) {
%r = tail call %s_i8 @test_s_i8(%s_i8 %a);
ret %s_i8 %r;
}
; CHECK: .func (.param .align 2 .b8 func_retval0[2])
; CHECK-LABEL: test_s_i16(
; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
; CHECK: .param .align 2 .b8 param0[2];
; CHECK: st.param.b16 [param0+0], [[A]]
; CHECK: .param .align 2 .b8 retval0[2];
; CHECK: call.uni
; CHECK-NEXT: test_s_i16,
; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
; CHECK: st.param.b16 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define %s_i16 @test_s_i16(%s_i16 %a) {
%r = tail call %s_i16 @test_s_i16(%s_i16 %a);
ret %s_i16 %r;
}
; CHECK: .func (.param .align 2 .b8 func_retval0[2])
; CHECK-LABEL: test_s_half(
; CHECK-NEXT: .param .align 2 .b8 test_s_half_param_0[2]
; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_half_param_0];
; CHECK: .param .align 2 .b8 param0[2];
; CHECK: st.param.b16 [param0+0], [[A]]
; CHECK: .param .align 2 .b8 retval0[2];
; CHECK: call.uni
; CHECK-NEXT: test_s_half,
; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
; CHECK: st.param.b16 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define %s_half @test_s_half(%s_half %a) {
%r = tail call %s_half @test_s_half(%s_half %a);
ret %s_half %r;
}
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
; CHECK-LABEL: test_s_i32(
; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0];
; CHECK: .param .align 4 .b8 param0[4]
; CHECK: st.param.b32 [param0+0], [[E]];
; CHECK: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_s_i32,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define %s_i32 @test_s_i32(%s_i32 %a) {
%r = tail call %s_i32 @test_s_i32(%s_i32 %a);
ret %s_i32 %r;
}
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
; CHECK-LABEL: test_s_float(
; CHECK-NEXT: .param .align 4 .b8 test_s_float_param_0[4]
; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_float_param_0];
; CHECK: .param .align 4 .b8 param0[4]
; CHECK: st.param.f32 [param0+0], [[E]];
; CHECK: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_s_float,
; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
; CHECK: st.param.f32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define %s_float @test_s_float(%s_float %a) {
%r = tail call %s_float @test_s_float(%s_float %a);
ret %s_float %r;
}
; CHECK: .func (.param .align 8 .b8 func_retval0[8])
; CHECK-LABEL: test_s_i64(
; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0];
; CHECK: .param .align 8 .b8 param0[8];
; CHECK: st.param.b64 [param0+0], [[E]];
; CHECK: .param .align 8 .b8 retval0[8];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_s_i64,
; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
; CHECK: st.param.b64 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define %s_i64 @test_s_i64(%s_i64 %a) {
%r = tail call %s_i64 @test_s_i64(%s_i64 %a);
ret %s_i64 %r;
}
; Adjacent fields of equal size but different type (integer vs. float) must
; not be merged into a vector access: each of the five fields is expected to
; be loaded and stored on its own.
; CHECK: .func (.param .align 8 .b8 func_retval0[24])
; CHECK-LABEL: test_s_i32f32(
; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24]
; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
; CHECK: .param .align 8 .b8 param0[24];
; CHECK-DAG: st.param.b32 [param0+0], [[E0]];
; CHECK-DAG: st.param.f32 [param0+4], [[E1]];
; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
; CHECK-DAG: st.param.f32 [param0+12], [[E3]];
; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
; CHECK: .param .align 8 .b8 retval0[24];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_s_i32f32,
; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0];
; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4];
; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12];
; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]];
; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]];
; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]];
; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
; CHECK: ret;
define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
  ; Forward the argument to a self-call and hand back whatever it returns.
  %result = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a)
  ret %s_i32f32 %result
}
; We do vectorize consecutive fields with matching types.
; The four 32-bit fields are expected to travel as two v2 accesses, followed by
; one scalar 64-bit access at offset 16. The 64-bit parameter load is captured
; as E4 so that the store into param0+16 is tied to the value loaded in this
; function rather than to a capture left over from an earlier test.
; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24])
; CHECK-LABEL: test_s_i32x4(
; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24]
; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32x4_param_0+16];
; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
; CHECK: .param .align 8 .b8 param0[24];
; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
; CHECK: st.param.b64 [param0+16], [[E4]];
; CHECK: .param .align 8 .b8 retval0[24];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_s_i32x4,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
; CHECK: ret;
define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
; Forward the argument to a self-call and hand back whatever it returns.
%r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
ret %s_i32x4 %r;
}
; 32-byte, 8-byte-aligned aggregate with a one-byte field at offset 8: only
; the first two 32-bit fields are expected to be vectorized; the byte and the
; fields after it are accessed individually.
; NOTE(review): the function name says "i1" while the IR type is %s_i8i32x4
; and the byte is accessed as u8/b8 - confirm the naming is intentional.
; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32])
; CHECK-LABEL: test_s_i1i32x4(
; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32]
; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
; CHECK: .param .align 8 .b8 param0[32];
; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
; CHECK: st.param.b8 [param0+8], [[E2]];
; CHECK: st.param.b32 [param0+12], [[E3]];
; CHECK: st.param.b32 [param0+16], [[E4]];
; CHECK: st.param.b64 [param0+24], [[E5]];
; CHECK: .param .align 8 .b8 retval0[32];
; CHECK: call.uni (retval0),
; CHECK: test_s_i1i32x4,
; CHECK: (
; CHECK: param0
; CHECK: );
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8];
; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12];
; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24];
; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
; CHECK: st.param.b8 [func_retval0+8], [[RE2]];
; CHECK: st.param.b32 [func_retval0+12], [[RE3]];
; CHECK: st.param.b32 [func_retval0+16], [[RE4]];
; CHECK: st.param.b64 [func_retval0+24], [[RE5]];
; CHECK: ret;
define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
  ; Forward the argument to a self-call and hand back whatever it returns.
  %call = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a)
  ret %s_i8i32x4 %call
}
; -- All loads/stores from parameters aligned by one must be done one
; -- byte at a time.
; The incoming 25-byte, align-1 parameter is expected to be read with 25
; individual byte loads, one per offset 0..24.
; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25])
; CHECK-LABEL: test_s_i1i32x4p(
; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25]
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0];
; --- TODO
; --- Unaligned parameter store/ return value load is broken in both nvcc
; --- and llvm and needs to be fixed.
; The expectations below record the current (unaligned, multi-byte) accesses.
; Register numbers are matched by pattern so that the test does not depend on
; register allocation.
; CHECK: .param .align 1 .b8 param0[25];
; CHECK-DAG: st.param.b32 [param0+0],
; CHECK-DAG: st.param.b32 [param0+4],
; CHECK-DAG: st.param.b8 [param0+8],
; CHECK-DAG: st.param.b32 [param0+9],
; CHECK-DAG: st.param.b32 [param0+13],
; CHECK-DAG: st.param.b64 [param0+17],
; CHECK: .param .align 1 .b8 retval0[25];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_s_i1i32x4p,
; CHECK-DAG: ld.param.b32 %r{{[0-9]+}}, [retval0+0];
; CHECK-DAG: ld.param.b32 %r{{[0-9]+}}, [retval0+4];
; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8];
; CHECK-DAG: ld.param.b32 %r{{[0-9]+}}, [retval0+9];
; CHECK-DAG: ld.param.b32 %r{{[0-9]+}}, [retval0+13];
; CHECK-DAG: ld.param.b64 %rd{{[0-9]+}}, [retval0+17];
; CHECK-DAG: st.param.b32 [func_retval0+0],
; CHECK-DAG: st.param.b32 [func_retval0+4],
; CHECK-DAG: st.param.b8 [func_retval0+8],
; CHECK-DAG: st.param.b32 [func_retval0+9],
; CHECK-DAG: st.param.b32 [func_retval0+13],
; CHECK-DAG: st.param.b64 [func_retval0+17],
define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
; Forward the argument to a self-call and hand back whatever it returns.
%r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
ret %s_i8i32x4p %r;
}
; Vector accesses may cover several neighbouring aggregate fields: the 80-byte,
; 16-byte-aligned argument is expected to move as a v2 + scalar pair, three v4
; groups at offsets 16/32/48, and a trailing scalar at offset 64.
; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80])
; CHECK-LABEL: test_s_crossfield(
; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80]
; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64];
; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48];
; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32];
; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16];
; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8];
; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0];
; CHECK: .param .align 16 .b8 param0[80];
; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
; CHECK: st.param.b32 [param0+8], [[E2]];
; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]};
; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]};
; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]};
; CHECK: st.param.b32 [param0+64], [[E15]];
; CHECK: .param .align 16 .b8 retval0[80];
; CHECK: call.uni (retval0),
; CHECK: test_s_crossfield,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16];
; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32];
; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48];
; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64];
; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
; CHECK: st.param.b32 [func_retval0+8], [[RE2]];
; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]};
; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]};
; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]};
; CHECK: st.param.b32 [func_retval0+64], [[RE15]];
; CHECK: ret;
define %s_crossfield @test_s_crossfield(%s_crossfield %a) {
  ; Forward the argument to a self-call and hand back whatever it returns.
  %ret = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a)
  ret %s_crossfield %ret
}

View File

@ -2,12 +2,81 @@
; Module data layout: little-endian ("e") with 32-bit pointers ("p:32:32:32").
; The i*/f*/v* entries give the ABI and preferred alignment for each scalar
; and vector width, and "n16:32:64" lists the native integer widths.
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
; Make sure we index into vectors properly: every 4-element chunk of the
; <16 x float> argument is expected to be loaded from, and stored back to, its
; own 16-byte offset (0, 16, 32 and 48).
; CHECK-LABEL: test_v16f32(
; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]}
; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]}
; CHECK-DAG: st.param.v4.f32 [func_retval0+32], {[[V_8_11]]}
; CHECK-DAG: st.param.v4.f32 [func_retval0+48], {[[V_12_15]]}
; CHECK: ret;
define <16 x float> @test_v16f32(<16 x float> %a) {
  ; Identity function: the argument is returned untouched.
  ret <16 x float> %a
}
; <8 x float> identity: the argument is expected to be read as two v4.f32
; loads (offsets 0 and 16) and written back with two matching v4.f32 stores.
; CHECK-LABEL: test_v8f32(
; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]}
; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]}
; CHECK: ret;
define <8 x float> @test_v8f32(<8 x float> %a) {
  ret <8 x float> %a
}
; <4 x float> identity: one v4.f32 load of the argument and one v4.f32 store
; of the return value are expected.
; CHECK-LABEL: test_v4f32(
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]}
; CHECK: ret;
define <4 x float> @test_v4f32(<4 x float> %a) {
  ret <4 x float> %a
}
define <2 x float> @test_v2f32(<2 x float> %a) {
; <2 x float> identity: one v2.f32 load of the argument and one v2.f32 store
; of the return value are expected. The capture covers elements 0 and 1.
; CHECK-LABEL: test_v2f32(
; CHECK-DAG: ld.param.v2.f32 {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_1]]}
; CHECK: ret;
ret <2 x float> %a
}
; Vectors whose length is not a power of two must not touch extra elements:
; <3 x float> is expected to split into a v2.f32 access for elements 0-1 and a
; scalar f32 access for element 2 at offset 8.
; CHECK-LABEL: test_v3f32(
; CHECK-DAG: ld.param.f32 [[V_2:%f[0-9]+]], [test_v3f32_param_0+8];
; CHECK-DAG: ld.param.v2.f32 {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0];
; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_1]]}
; CHECK-DAG: st.param.f32 [func_retval0+8], [[V_2]]
; CHECK: ret;
define <3 x float> @test_v3f32(<3 x float> %a) {
  ret <3 x float> %a
}
; <8 x i64> identity: the argument is expected to be read as four v2.u64 loads
; (offsets 0, 16, 32, 48) and written back with four matching v2.b64 stores.
; CHECK-LABEL: test_v8i64(
; CHECK-DAG: ld.param.v2.u64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48];
; CHECK-DAG: ld.param.v2.u64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32];
; CHECK-DAG: ld.param.v2.u64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16];
; CHECK-DAG: ld.param.v2.u64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0];
; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[V_0_1]]}
; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]}
; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]}
; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_6_7]]}
; CHECK: ret;
define <8 x i64> @test_v8i64(<8 x i64> %a) {
  ret <8 x i64> %a
}
; <16 x i16> identity: the argument is expected to be read as four v4.u16
; loads (offsets 0, 8, 16, 24) and written back with four matching v4.b16
; stores.
; CHECK-LABEL: test_v16i16(
; CHECK-DAG: ld.param.v4.u16 {[[V_12_15:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+24];
; CHECK-DAG: ld.param.v4.u16 {[[V_8_11:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16];
; CHECK-DAG: ld.param.v4.u16 {[[V_4_7:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+8];
; CHECK-DAG: ld.param.v4.u16 {[[V_0_3:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0];
; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[V_0_3]]}
; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[V_4_7]]}
; CHECK-DAG: st.param.v4.b16 [func_retval0+16], {[[V_8_11]]}
; CHECK-DAG: st.param.v4.b16 [func_retval0+24], {[[V_12_15]]}
; CHECK: ret;
define <16 x i16> @test_v16i16(<16 x i16> %a) {
  ret <16 x i16> %a
}

View File

@ -4,10 +4,15 @@ target triple = "nvptx-unknown-cuda"
; The <8 x i8> argument is expected to be loaded as two v4.u8 vectors (bytes
; 0-3 and 4-7). Elements 1 and 6 are then added and the 8-bit sum is stored
; through the pointer argument, which pins which lane of each load is used.
; CHECK: .visible .func foo
define void @foo(<8 x i8> %a, i8* %b) {
; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
; CHECK-DAG: ld.param.u32 %[[B:r[0-9]+]], [foo_param_1]
; CHECK: add.s16 [[T:%rs[0-9]+]], [[E1]], [[E6]];
; CHECK: st.u8 [%[[B]]], [[T]];
  ; One element from each half of the vector, so both loads are live.
  %t0 = extractelement <8 x i8> %a, i32 1
  %t1 = extractelement <8 x i8> %a, i32 6
  %t = add i8 %t0, %t1
  store i8 %t, i8* %b
  ret void
}

View File

@ -4,9 +4,27 @@ target triple = "nvptx-unknown-cuda"
declare void @bar(<4 x i32>)

; A <4 x i32> argument forwarded to @bar is expected to be loaded with one
; v4.u32 access and stored into the 16-byte-aligned outgoing parameter with
; one v4.b32 access before the call.
; CHECK-LABEL: .func foo(
; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
; CHECK: .param .align 16 .b8 param0[16];
; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK: call.uni
; CHECK: ret;
define void @foo(<4 x i32> %a) {
  tail call void @bar(<4 x i32> %a)
  ret void
}
; A <3 x i32> argument forwarded to @bar3 is expected to be split into a
; v2 access for elements 0-1 plus a scalar access for element 2 at offset 8,
; both when reading the incoming parameter and when filling the 16-byte
; outgoing parameter.
declare void @bar3(<3 x i32>)

; CHECK-LABEL: .func foo3(
; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
; CHECK: .param .align 16 .b8 param0[16];
; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
; CHECK: call.uni
; CHECK: ret;
define void @foo3(<3 x i32> %a) {
  tail call void @bar3(<3 x i32> %a)
  ret void
}