diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll index 0dacc72541bf..6f8ab9248ca3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll @@ -2,34 +2,55 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s -; GCN-LABEL: @bswap_v2i16( -; GFX7: call i16 @llvm.bswap.i16( -; GFX7: call i16 @llvm.bswap.i16( - -; GFX8: call <2 x i16> @llvm.bswap.v2i16( define <2 x i16> @bswap_v2i16(<2 x i16> %arg) { +; GFX7-LABEL: @bswap_v2i16( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[T:%.*]] = extractelement <2 x i16> [[ARG:%.*]], i64 0 +; GFX7-NEXT: [[T1:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[T]]) +; GFX7-NEXT: [[T2:%.*]] = insertelement <2 x i16> poison, i16 [[T1]], i64 0 +; GFX7-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[ARG]], i64 1 +; GFX7-NEXT: [[T4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[T3]]) +; GFX7-NEXT: [[T5:%.*]] = insertelement <2 x i16> [[T2]], i16 [[T4]], i64 1 +; GFX7-NEXT: ret <2 x i16> [[T5]] +; +; GFX8-LABEL: @bswap_v2i16( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[ARG:%.*]]) +; GFX8-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[TMP0]], i32 0 +; GFX8-NEXT: [[T2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0 +; GFX8-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[TMP0]], i32 1 +; GFX8-NEXT: [[T5:%.*]] = insertelement <2 x i16> [[T2]], i16 [[TMP2]], i64 1 +; GFX8-NEXT: ret <2 x i16> [[T5]] +; bb: - %tmp = extractelement <2 x i16> %arg, i64 0 - %tmp1 = tail call i16 @llvm.bswap.i16(i16 %tmp) - %tmp2 = insertelement <2 x i16> poison, i16 %tmp1, i64 0 - %tmp3 = extractelement <2 x i16> %arg, i64 1 - %tmp4 = tail call i16 @llvm.bswap.i16(i16 %tmp3) - %tmp5 = insertelement <2 x i16> %tmp2, i16 %tmp4, i64 1 - ret <2 x i16> %tmp5 + %t = extractelement <2 x i16> %arg, i64 0 + %t1 = tail call i16 @llvm.bswap.i16(i16 %t) + %t2 = insertelement <2 x i16> poison, i16 %t1, i64 0 + %t3 = extractelement <2 x i16> %arg, i64 1 + %t4 = tail call i16 @llvm.bswap.i16(i16 %t3) + %t5 = insertelement <2 x i16> %t2, i16 %t4, i64 1 + ret <2 x i16> %t5 } -; GCN-LABEL: @bswap_v2i32( -; GCN: call i32 @llvm.bswap.i32 -; GCN: call i32 @llvm.bswap.i32 define <2 x i32> @bswap_v2i32(<2 x i32> %arg) { +; GCN-LABEL: @bswap_v2i32( +; GCN-NEXT: bb: +; GCN-NEXT: [[T:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[T1:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[T]]) +; GCN-NEXT: [[T2:%.*]] = insertelement <2 x i32> poison, i32 [[T1]], i64 0 +; GCN-NEXT: [[T3:%.*]] = extractelement <2 x i32> [[ARG]], i64 1 +; GCN-NEXT: [[T4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[T3]]) +; GCN-NEXT: [[T5:%.*]] = insertelement <2 x i32> [[T2]], i32 [[T4]], i64 1 +; GCN-NEXT: ret <2 x i32> [[T5]] +; bb: - %tmp = extractelement <2 x i32> %arg, i64 0 - %tmp1 = tail call i32 @llvm.bswap.i32(i32 %tmp) - %tmp2 = insertelement <2 x i32> poison, i32 %tmp1, i64 0 - %tmp3 = extractelement <2 x i32> %arg, i64 1 - %tmp4 = tail call i32 @llvm.bswap.i32(i32 %tmp3) - %tmp5 = insertelement <2 x i32> %tmp2, i32 %tmp4, i64 1 - ret <2 x i32> %tmp5 + %t = extractelement <2 x i32> %arg, i64 0 + %t1 = tail call i32 @llvm.bswap.i32(i32 %t) + %t2 = insertelement <2 x i32> poison, i32 %t1, i64 0 + %t3 = extractelement <2 x i32> %arg, i64 1 + %t4 = tail call i32 @llvm.bswap.i32(i32 %t3) + %t5 = insertelement <2 x i32> %t2, i32 %t4, i64 1 + ret <2 x i32> %t5 } declare i16 @llvm.bswap.i16(i16) #0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll index bf42d2f5ff64..5541e0295637 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll @@ -2,34 +2,55 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s -; GCN-LABEL: @bswap_v2i16( -; GFX7: call i16 @llvm.bswap.i16( -; GFX7: call i16 @llvm.bswap.i16( - -; GFX8: call <2 x i16> @llvm.bswap.v2i16( define <2 x i16> @bswap_v2i16(<2 x i16> %arg) { +; GFX7-LABEL: @bswap_v2i16( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[T:%.*]] = extractelement <2 x i16> [[ARG:%.*]], i64 0 +; GFX7-NEXT: [[T1:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[T]]) +; GFX7-NEXT: [[T2:%.*]] = insertelement <2 x i16> undef, i16 [[T1]], i64 0 +; GFX7-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[ARG]], i64 1 +; GFX7-NEXT: [[T4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[T3]]) +; GFX7-NEXT: [[T5:%.*]] = insertelement <2 x i16> [[T2]], i16 [[T4]], i64 1 +; GFX7-NEXT: ret <2 x i16> [[T5]] +; +; GFX8-LABEL: @bswap_v2i16( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[ARG:%.*]]) +; GFX8-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[TMP0]], i32 0 +; GFX8-NEXT: [[T2:%.*]] = insertelement <2 x i16> undef, i16 [[TMP1]], i64 0 +; GFX8-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[TMP0]], i32 1 +; GFX8-NEXT: [[T5:%.*]] = insertelement <2 x i16> [[T2]], i16 [[TMP2]], i64 1 +; GFX8-NEXT: ret <2 x i16> [[T5]] +; bb: - %tmp = extractelement <2 x i16> %arg, i64 0 - %tmp1 = tail call i16 @llvm.bswap.i16(i16 %tmp) - %tmp2 = insertelement <2 x i16> undef, i16 %tmp1, i64 0 - %tmp3 = extractelement <2 x i16> %arg, i64 1 - %tmp4 = tail call i16 @llvm.bswap.i16(i16 %tmp3) - %tmp5 = insertelement <2 x i16> %tmp2, i16 %tmp4, i64 1 - ret <2 x i16> %tmp5 + %t = extractelement <2 x i16> %arg, i64 0 + %t1 = tail call i16 @llvm.bswap.i16(i16 %t) + %t2 = insertelement <2 x i16> undef, i16 %t1, i64 0 + %t3 = extractelement <2 x i16> %arg, i64 1 + %t4 = tail call i16 @llvm.bswap.i16(i16 %t3) + %t5 = insertelement <2 x i16> %t2, i16 %t4, i64 1 + ret <2 x i16> %t5 } -; GCN-LABEL: @bswap_v2i32( -; GCN: call i32 @llvm.bswap.i32 -; GCN: call i32 @llvm.bswap.i32 define <2 x i32> @bswap_v2i32(<2 x i32> %arg) { +; GCN-LABEL: @bswap_v2i32( +; GCN-NEXT: bb: +; GCN-NEXT: [[T:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[T1:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[T]]) +; GCN-NEXT: [[T2:%.*]] = insertelement <2 x i32> undef, i32 [[T1]], i64 0 +; GCN-NEXT: [[T3:%.*]] = extractelement <2 x i32> [[ARG]], i64 1 +; GCN-NEXT: [[T4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[T3]]) +; GCN-NEXT: [[T5:%.*]] = insertelement <2 x i32> [[T2]], i32 [[T4]], i64 1 +; GCN-NEXT: ret <2 x i32> [[T5]] +; bb: - %tmp = extractelement <2 x i32> %arg, i64 0 - %tmp1 = tail call i32 @llvm.bswap.i32(i32 %tmp) - %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i64 0 - %tmp3 = extractelement <2 x i32> %arg, i64 1 - %tmp4 = tail call i32 @llvm.bswap.i32(i32 %tmp3) - %tmp5 = insertelement <2 x i32> %tmp2, i32 %tmp4, i64 1 - ret <2 x i32> %tmp5 + %t = extractelement <2 x i32> %arg, i64 0 + %t1 = tail call i32 @llvm.bswap.i32(i32 %t) + %t2 = insertelement <2 x i32> undef, i32 %t1, i64 0 + %t3 = extractelement <2 x i32> %arg, i64 1 + %t4 = tail call i32 @llvm.bswap.i32(i32 %t3) + %t5 = insertelement <2 x i32> %t2, i32 %t4, i64 1 + ret <2 x i32> %t5 } declare i16 @llvm.bswap.i16(i16) #0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll index b2e444931802..51ff7a568b77 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll @@ -2,34 +2,55 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s -; GCN-LABEL: @round_v2f16( -; GFX7: call half @llvm.round.f16( -; GFX7: call half @llvm.round.f16( - -; GFX8: call <2 x half> @llvm.round.v2f16( define <2 x half> @round_v2f16(<2 x half> %arg) { +; GFX7-LABEL: @round_v2f16( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[T:%.*]] = extractelement <2 x half> [[ARG:%.*]], i64 0 +; GFX7-NEXT: [[T1:%.*]] = tail call half @llvm.round.f16(half [[T]]) +; GFX7-NEXT: [[T2:%.*]] = insertelement <2 x half> poison, half [[T1]], i64 0 +; GFX7-NEXT: [[T3:%.*]] = extractelement <2 x half> [[ARG]], i64 1 +; GFX7-NEXT: [[T4:%.*]] = tail call half @llvm.round.f16(half [[T3]]) +; GFX7-NEXT: [[T5:%.*]] = insertelement <2 x half> [[T2]], half [[T4]], i64 1 +; GFX7-NEXT: ret <2 x half> [[T5]] +; +; GFX8-LABEL: @round_v2f16( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[TMP0:%.*]] = call <2 x half> @llvm.round.v2f16(<2 x half> [[ARG:%.*]]) +; GFX8-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[TMP0]], i32 0 +; GFX8-NEXT: [[T2:%.*]] = insertelement <2 x half> poison, half [[TMP1]], i64 0 +; GFX8-NEXT: [[TMP2:%.*]] = extractelement <2 x half> [[TMP0]], i32 1 +; GFX8-NEXT: [[T5:%.*]] = insertelement <2 x half> [[T2]], half [[TMP2]], i64 1 +; GFX8-NEXT: ret <2 x half> [[T5]] +; bb: - %tmp = extractelement <2 x half> %arg, i64 0 - %tmp1 = tail call half @llvm.round.half(half %tmp) - %tmp2 = insertelement <2 x half> poison, half %tmp1, i64 0 - %tmp3 = extractelement <2 x half> %arg, i64 1 - %tmp4 = tail call half @llvm.round.half(half %tmp3) - %tmp5 = insertelement <2 x half> %tmp2, half %tmp4, i64 1 - ret <2 x half> %tmp5 + %t = extractelement <2 x half> %arg, i64 0 + %t1 = tail call half @llvm.round.half(half %t) + %t2 = insertelement <2 x half> poison, half %t1, i64 0 + %t3 = extractelement <2 x half> %arg, i64 1 + %t4 = tail call half @llvm.round.half(half %t3) + %t5 = insertelement <2 x half> %t2, half %t4, i64 1 + ret <2 x half> %t5 } -; GCN-LABEL: @round_v2f32( -; GCN: call float @llvm.round.f32( -; GCN: call float @llvm.round.f32( define <2 x float> @round_v2f32(<2 x float> %arg) { +; GCN-LABEL: @round_v2f32( +; GCN-NEXT: bb: +; GCN-NEXT: [[T:%.*]] = extractelement <2 x float> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[T1:%.*]] = tail call float @llvm.round.f32(float [[T]]) +; GCN-NEXT: [[T2:%.*]] = insertelement <2 x float> poison, float [[T1]], i64 0 +; GCN-NEXT: [[T3:%.*]] = extractelement <2 x float> [[ARG]], i64 1 +; GCN-NEXT: [[T4:%.*]] = tail call float @llvm.round.f32(float [[T3]]) +; GCN-NEXT: [[T5:%.*]] = insertelement <2 x float> [[T2]], float [[T4]], i64 1 +; GCN-NEXT: ret <2 x float> [[T5]] +; bb: - %tmp = extractelement <2 x float> %arg, i64 0 - %tmp1 = tail call float @llvm.round.f32(float %tmp) - %tmp2 = insertelement <2 x float> poison, float %tmp1, i64 0 - %tmp3 = extractelement <2 x float> %arg, i64 1 - %tmp4 = tail call float @llvm.round.f32(float %tmp3) - %tmp5 = insertelement <2 x float> %tmp2, float %tmp4, i64 1 - ret <2 x float> %tmp5 + %t = extractelement <2 x float> %arg, i64 0 + %t1 = tail call float @llvm.round.f32(float %t) + %t2 = insertelement <2 x float> poison, float %t1, i64 0 + %t3 = extractelement <2 x float> %arg, i64 1 + %t4 = tail call float @llvm.round.f32(float %t3) + %t5 = insertelement <2 x float> %t2, float %t4, i64 1 + ret <2 x float> %t5 } declare half @llvm.round.half(half) #0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll index 7a8e08de4138..045abffa000f 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll @@ -2,34 +2,56 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s -; GCN-LABEL: @round_v2f16( -; GFX7: call half @llvm.round.f16( -; GFX7: call half @llvm.round.f16( - -; GFX8: call <2 x half> @llvm.round.v2f16( define <2 x half> @round_v2f16(<2 x half> %arg) { +; GFX7-LABEL: @round_v2f16( +; GFX7-NEXT: bb: +; GFX7-NEXT: [[T:%.*]] = extractelement <2 x half> [[ARG:%.*]], i64 0 +; GFX7-NEXT: [[T1:%.*]] = tail call half @llvm.round.f16(half [[T]]) +; GFX7-NEXT: [[T2:%.*]] = insertelement <2 x half> undef, half [[T1]], i64 0 +; GFX7-NEXT: [[T3:%.*]] = extractelement <2 x half> [[ARG]], i64 1 +; GFX7-NEXT: [[T4:%.*]] = tail call half @llvm.round.f16(half [[T3]]) +; GFX7-NEXT: [[T5:%.*]] = insertelement <2 x half> [[T2]], half [[T4]], i64 1 +; GFX7-NEXT: ret <2 x half> [[T5]] +; +; GFX8-LABEL: @round_v2f16( +; GFX8-NEXT: bb: +; GFX8-NEXT: [[TMP0:%.*]] = call <2 x half> @llvm.round.v2f16(<2 x half> [[ARG:%.*]]) +; GFX8-NEXT: [[TMP1:%.*]] = extractelement <2 x half> [[TMP0]], i32 0 +; GFX8-NEXT: [[T2:%.*]] = insertelement <2 x half> undef, half [[TMP1]], i64 0 +; GFX8-NEXT: [[TMP2:%.*]] = extractelement <2 x half> [[TMP0]], i32 1 +; GFX8-NEXT: [[T5:%.*]] = insertelement <2 x half> [[T2]], half [[TMP2]], i64 1 +; GFX8-NEXT: ret <2 x half> [[T5]] +; bb: - %tmp = extractelement <2 x half> %arg, i64 0 - %tmp1 = tail call half @llvm.round.half(half %tmp) - %tmp2 = insertelement <2 x half> undef, half %tmp1, i64 0 - %tmp3 = extractelement <2 x half> %arg, i64 1 - %tmp4 = tail call half @llvm.round.half(half %tmp3) - %tmp5 = insertelement <2 x half> %tmp2, half %tmp4, i64 1 - ret <2 x half> %tmp5 + %t = extractelement <2 x half> %arg, i64 0 + %t1 = tail call half @llvm.round.half(half %t) + %t2 = insertelement <2 x half> undef, half %t1, i64 0 + %t3 = extractelement <2 x half> %arg, i64 1 + %t4 = tail call half @llvm.round.half(half %t3) + %t5 = insertelement <2 x half> %t2, half %t4, i64 1 + ret <2 x half> %t5 } -; GCN-LABEL: @round_v2f32( -; GCN: call float @llvm.round.f32( -; GCN: call float @llvm.round.f32( + define <2 x float> @round_v2f32(<2 x float> %arg) { +; GCN-LABEL: @round_v2f32( +; GCN-NEXT: bb: +; GCN-NEXT: [[T:%.*]] = extractelement <2 x float> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[T1:%.*]] = tail call float @llvm.round.f32(float [[T]]) +; GCN-NEXT: [[T2:%.*]] = insertelement <2 x float> undef, float [[T1]], i64 0 +; GCN-NEXT: [[T3:%.*]] = extractelement <2 x float> [[ARG]], i64 1 +; GCN-NEXT: [[T4:%.*]] = tail call float @llvm.round.f32(float [[T3]]) +; GCN-NEXT: [[T5:%.*]] = insertelement <2 x float> [[T2]], float [[T4]], i64 1 +; GCN-NEXT: ret <2 x float> [[T5]] +; bb: - %tmp = extractelement <2 x float> %arg, i64 0 - %tmp1 = tail call float @llvm.round.f32(float %tmp) - %tmp2 = insertelement <2 x float> undef, float %tmp1, i64 0 - %tmp3 = extractelement <2 x float> %arg, i64 1 - %tmp4 = tail call float @llvm.round.f32(float %tmp3) - %tmp5 = insertelement <2 x float> %tmp2, float %tmp4, i64 1 - ret <2 x float> %tmp5 + %t = extractelement <2 x float> %arg, i64 0 + %t1 = tail call float @llvm.round.f32(float %t) + %t2 = insertelement <2 x float> undef, float %t1, i64 0 + %t3 = extractelement <2 x float> %arg, i64 1 + %t4 = tail call float @llvm.round.f32(float %t3) + %t5 = insertelement <2 x float> %t2, float %t4, i64 1 + ret <2 x float> %t5 } declare half @llvm.round.half(half) #0