From 0c66deb498e6c2389988500b5037d91b81f79ed9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 7 Jun 2022 08:07:49 -0700 Subject: [PATCH] [RISCV] Scalarize gather/scatter on RV64 with Zve32* extension. i64 indices aren't supported on Zve32*. Scalarize gathers and scatters to prevent generating illegal instructions. Since InstCombine will aggressively canonicalize GEP indices to pointer size, we're pretty much always going to have an i64 index. Trying to predict when SelectionDAG will find a smaller index from the TTI hook used by the ScalarizeMaskedMemIntrinPass seems fragile. To optimize this we probably need an IR pass to rewrite it earlier. Test RUN lines have also been added to make sure the strided load/store optimization still works. Reviewed By: reames Differential Revision: https://reviews.llvm.org/D127179 --- .../Target/RISCV/RISCVTargetTransformInfo.h | 10 + .../fixed-vector-strided-load-store-asm.ll | 275 +- .../rvv/fixed-vector-strided-load-store.ll | 145 +- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 13757 ++++++++++++++-- .../RISCV/rvv/fixed-vectors-masked-scatter.ll | 10674 +++++++++++- 5 files changed, 23277 insertions(+), 1584 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 52e20e941157..03a4caada958 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -161,6 +161,16 @@ public: return isLegalMaskedGatherScatter(DataType, Alignment); } + bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { + // Scalarize masked gather for RV64 if EEW=64 indices aren't supported. + return ST->is64Bit() && !ST->hasVInstructionsI64(); + } + + bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) { + // Scalarize masked scatter for RV64 if EEW=64 indices aren't supported. 
+ return ST->is64Bit() && !ST->hasVInstructionsI64(); + } + /// \returns How the target needs this vector-predicated operation to be /// transformed. TargetTransformInfo::VPLegalization diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll index 430b3645dc53..1a4f53cfd5b2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,V +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,ZVE32F %struct.foo = type { i32, i32, i32, i32 } @@ -54,30 +55,55 @@ for.cond.cleanup: ; preds = %vector.body define void @gather_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) { ; -; CHECK-LABEL: gather_masked: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: lui a3, 983765 -; CHECK-NEXT: addiw a3, a3, 873 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vmv.s.x v0, a3 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: li a4, 5 -; CHECK-NEXT: li a5, 1024 -; CHECK-NEXT: .LBB1_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-NEXT: vmv1r.v v9, v8 -; CHECK-NEXT: vlse8.v v9, (a1), a4, v0.t -; CHECK-NEXT: add a6, a0, a2 -; CHECK-NEXT: vle8.v v10, (a6) -; CHECK-NEXT: vadd.vv v9, v10, v9 -; CHECK-NEXT: vse8.v v9, (a6) -; CHECK-NEXT: addi a2, a2, 32 -; CHECK-NEXT: addi a1, a1, 160 -; CHECK-NEXT: bne a2, a5, .LBB1_1 -; CHECK-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-NEXT: ret +; V-LABEL: gather_masked: +; 
V: # %bb.0: # %entry +; V-NEXT: li a2, 0 +; V-NEXT: lui a3, 983765 +; V-NEXT: addiw a3, a3, 873 +; V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; V-NEXT: vmv.s.x v0, a3 +; V-NEXT: li a3, 32 +; V-NEXT: li a4, 5 +; V-NEXT: li a5, 1024 +; V-NEXT: .LBB1_1: # %vector.body +; V-NEXT: # =>This Inner Loop Header: Depth=1 +; V-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; V-NEXT: vmv1r.v v9, v8 +; V-NEXT: vlse8.v v9, (a1), a4, v0.t +; V-NEXT: add a6, a0, a2 +; V-NEXT: vle8.v v10, (a6) +; V-NEXT: vadd.vv v9, v10, v9 +; V-NEXT: vse8.v v9, (a6) +; V-NEXT: addi a2, a2, 32 +; V-NEXT: addi a1, a1, 160 +; V-NEXT: bne a2, a5, .LBB1_1 +; V-NEXT: # %bb.2: # %for.cond.cleanup +; V-NEXT: ret +; +; ZVE32F-LABEL: gather_masked: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: li a2, 0 +; ZVE32F-NEXT: lui a3, 983765 +; ZVE32F-NEXT: addiw a3, a3, 873 +; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; ZVE32F-NEXT: vmv.s.x v0, a3 +; ZVE32F-NEXT: li a3, 32 +; ZVE32F-NEXT: li a4, 5 +; ZVE32F-NEXT: li a5, 1024 +; ZVE32F-NEXT: .LBB1_1: # %vector.body +; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 +; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; ZVE32F-NEXT: vmv1r.v v9, v8 +; ZVE32F-NEXT: vlse8.v v9, (a1), a4, v0.t +; ZVE32F-NEXT: add a6, a0, a2 +; ZVE32F-NEXT: vle8.v v10, (a6) +; ZVE32F-NEXT: vadd.vv v9, v10, v9 +; ZVE32F-NEXT: vse8.v v9, (a6) +; ZVE32F-NEXT: addi a2, a2, 32 +; ZVE32F-NEXT: addi a1, a1, 160 +; ZVE32F-NEXT: bne a2, a5, .LBB1_1 +; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup +; ZVE32F-NEXT: ret entry: br label %vector.body @@ -242,30 +268,55 @@ for.cond.cleanup: ; preds = %vector.body define void @scatter_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) { ; -; CHECK-LABEL: scatter_masked: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: lui a4, 983765 -; CHECK-NEXT: addiw a4, a4, 873 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vmv.s.x v0, a4 -; CHECK-NEXT: li a4, 5 -; CHECK-NEXT: 
li a5, 1024 -; CHECK-NEXT: .LBB5_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add a6, a1, a2 -; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-NEXT: vle8.v v9, (a6) -; CHECK-NEXT: vmv1r.v v10, v8 -; CHECK-NEXT: vlse8.v v10, (a0), a4, v0.t -; CHECK-NEXT: vadd.vv v9, v10, v9 -; CHECK-NEXT: vsse8.v v9, (a0), a4, v0.t -; CHECK-NEXT: addi a2, a2, 32 -; CHECK-NEXT: addi a0, a0, 160 -; CHECK-NEXT: bne a2, a5, .LBB5_1 -; CHECK-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-NEXT: ret +; V-LABEL: scatter_masked: +; V: # %bb.0: # %entry +; V-NEXT: li a2, 0 +; V-NEXT: li a3, 32 +; V-NEXT: lui a4, 983765 +; V-NEXT: addiw a4, a4, 873 +; V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; V-NEXT: vmv.s.x v0, a4 +; V-NEXT: li a4, 5 +; V-NEXT: li a5, 1024 +; V-NEXT: .LBB5_1: # %vector.body +; V-NEXT: # =>This Inner Loop Header: Depth=1 +; V-NEXT: add a6, a1, a2 +; V-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; V-NEXT: vle8.v v9, (a6) +; V-NEXT: vmv1r.v v10, v8 +; V-NEXT: vlse8.v v10, (a0), a4, v0.t +; V-NEXT: vadd.vv v9, v10, v9 +; V-NEXT: vsse8.v v9, (a0), a4, v0.t +; V-NEXT: addi a2, a2, 32 +; V-NEXT: addi a0, a0, 160 +; V-NEXT: bne a2, a5, .LBB5_1 +; V-NEXT: # %bb.2: # %for.cond.cleanup +; V-NEXT: ret +; +; ZVE32F-LABEL: scatter_masked: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: li a2, 0 +; ZVE32F-NEXT: li a3, 32 +; ZVE32F-NEXT: lui a4, 983765 +; ZVE32F-NEXT: addiw a4, a4, 873 +; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; ZVE32F-NEXT: vmv.s.x v0, a4 +; ZVE32F-NEXT: li a4, 5 +; ZVE32F-NEXT: li a5, 1024 +; ZVE32F-NEXT: .LBB5_1: # %vector.body +; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 +; ZVE32F-NEXT: add a6, a1, a2 +; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; ZVE32F-NEXT: vle8.v v9, (a6) +; ZVE32F-NEXT: vmv1r.v v10, v8 +; ZVE32F-NEXT: vlse8.v v10, (a0), a4, v0.t +; ZVE32F-NEXT: vadd.vv v9, v10, v9 +; ZVE32F-NEXT: vsse8.v v9, (a0), a4, v0.t +; ZVE32F-NEXT: addi a2, a2, 32 +; ZVE32F-NEXT: addi a0, a0, 160 +; ZVE32F-NEXT: bne 
a2, a5, .LBB5_1 +; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup +; ZVE32F-NEXT: ret entry: br label %vector.body @@ -554,24 +605,51 @@ declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32 immar ; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers. define void @gather_of_pointers(i32** noalias nocapture %0, i32** noalias nocapture readonly %1) { ; -; CHECK-LABEL: gather_of_pointers: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: li a3, 40 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vlse64.v v8, (a1), a3 -; CHECK-NEXT: addi a4, a1, 80 -; CHECK-NEXT: vlse64.v v9, (a4), a3 -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: addi a4, a0, 16 -; CHECK-NEXT: vse64.v v9, (a4) -; CHECK-NEXT: addi a2, a2, -4 -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: addi a1, a1, 160 -; CHECK-NEXT: bnez a2, .LBB10_1 -; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: ret +; V-LABEL: gather_of_pointers: +; V: # %bb.0: +; V-NEXT: li a2, 1024 +; V-NEXT: li a3, 40 +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 +; V-NEXT: vlse64.v v8, (a1), a3 +; V-NEXT: addi a4, a1, 80 +; V-NEXT: vlse64.v v9, (a4), a3 +; V-NEXT: vse64.v v8, (a0) +; V-NEXT: addi a4, a0, 16 +; V-NEXT: vse64.v v9, (a4) +; V-NEXT: addi a2, a2, -4 +; V-NEXT: addi a0, a0, 32 +; V-NEXT: addi a1, a1, 160 +; V-NEXT: bnez a2, .LBB10_1 +; V-NEXT: # %bb.2: +; V-NEXT: ret +; +; ZVE32F-LABEL: gather_of_pointers: +; ZVE32F: # %bb.0: +; ZVE32F-NEXT: li a2, 0 +; ZVE32F-NEXT: li a3, 1 +; ZVE32F-NEXT: li a4, 1024 +; ZVE32F-NEXT: li a5, 40 +; ZVE32F-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 +; ZVE32F-NEXT: mul a6, a3, a5 +; ZVE32F-NEXT: add a6, a1, a6 +; ZVE32F-NEXT: mul a7, a2, a5 +; ZVE32F-NEXT: add a7, a1, a7 +; ZVE32F-NEXT: ld t0, 0(a6) +; ZVE32F-NEXT: ld t1, 0(a7) +; ZVE32F-NEXT: ld a6, 80(a6) +; ZVE32F-NEXT: ld a7, 80(a7) +; ZVE32F-NEXT: sd t0, 8(a0) +; 
ZVE32F-NEXT: sd t1, 0(a0) +; ZVE32F-NEXT: sd a6, 24(a0) +; ZVE32F-NEXT: sd a7, 16(a0) +; ZVE32F-NEXT: addi a2, a2, 4 +; ZVE32F-NEXT: addi a3, a3, 4 +; ZVE32F-NEXT: addi a4, a4, -4 +; ZVE32F-NEXT: addi a0, a0, 32 +; ZVE32F-NEXT: bnez a4, .LBB10_1 +; ZVE32F-NEXT: # %bb.2: +; ZVE32F-NEXT: ret br label %3 3: ; preds = %3, %2 @@ -604,24 +682,51 @@ declare <2 x i32*> @llvm.masked.gather.v2p0i32.v2p0p0i32(<2 x i32**>, i32 immarg ; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers. define void @scatter_of_pointers(i32** noalias nocapture %0, i32** noalias nocapture readonly %1) { ; -; CHECK-LABEL: scatter_of_pointers: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: li a3, 40 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a1) -; CHECK-NEXT: addi a4, a1, 16 -; CHECK-NEXT: vle64.v v9, (a4) -; CHECK-NEXT: addi a4, a0, 80 -; CHECK-NEXT: vsse64.v v8, (a0), a3 -; CHECK-NEXT: vsse64.v v9, (a4), a3 -; CHECK-NEXT: addi a2, a2, -4 -; CHECK-NEXT: addi a1, a1, 32 -; CHECK-NEXT: addi a0, a0, 160 -; CHECK-NEXT: bnez a2, .LBB11_1 -; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: ret +; V-LABEL: scatter_of_pointers: +; V: # %bb.0: +; V-NEXT: li a2, 1024 +; V-NEXT: li a3, 40 +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; V-NEXT: vle64.v v8, (a1) +; V-NEXT: addi a4, a1, 16 +; V-NEXT: vle64.v v9, (a4) +; V-NEXT: addi a4, a0, 80 +; V-NEXT: vsse64.v v8, (a0), a3 +; V-NEXT: vsse64.v v9, (a4), a3 +; V-NEXT: addi a2, a2, -4 +; V-NEXT: addi a1, a1, 32 +; V-NEXT: addi a0, a0, 160 +; V-NEXT: bnez a2, .LBB11_1 +; V-NEXT: # %bb.2: +; V-NEXT: ret +; +; ZVE32F-LABEL: scatter_of_pointers: +; ZVE32F: # %bb.0: +; ZVE32F-NEXT: li a2, 0 +; ZVE32F-NEXT: li a3, 1 +; ZVE32F-NEXT: li a4, 1024 +; ZVE32F-NEXT: li a5, 40 +; ZVE32F-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; ZVE32F-NEXT: ld a6, 8(a1) +; ZVE32F-NEXT: ld a7, 0(a1) +; 
ZVE32F-NEXT: ld t0, 24(a1) +; ZVE32F-NEXT: ld t1, 16(a1) +; ZVE32F-NEXT: mul t2, a3, a5 +; ZVE32F-NEXT: add t2, a0, t2 +; ZVE32F-NEXT: mul t3, a2, a5 +; ZVE32F-NEXT: add t3, a0, t3 +; ZVE32F-NEXT: sd a7, 0(t3) +; ZVE32F-NEXT: sd a6, 0(t2) +; ZVE32F-NEXT: sd t1, 80(t3) +; ZVE32F-NEXT: sd t0, 80(t2) +; ZVE32F-NEXT: addi a2, a2, 4 +; ZVE32F-NEXT: addi a3, a3, 4 +; ZVE32F-NEXT: addi a4, a4, -4 +; ZVE32F-NEXT: addi a1, a1, 32 +; ZVE32F-NEXT: bnez a4, .LBB11_1 +; ZVE32F-NEXT: # %bb.2: +; ZVE32F-NEXT: ret br label %3 3: ; preds = %3, %2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll index e5b48c1df3c8..7f34be932b42 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s +; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,V +; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+f,+zve32f -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,ZVE32F %struct.foo = type { i32, i32, i32, i32 } @@ -575,29 +576,54 @@ declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32 immar ; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers. 
define void @gather_of_pointers(i32** noalias nocapture %0, i32** noalias nocapture readonly %1) { ; -; CHECK-LABEL: @gather_of_pointers( -; CHECK-NEXT: br label [[TMP3:%.*]] -; CHECK: 3: -; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP13:%.*]], [[TMP3]] ] -; CHECK-NEXT: [[DOTSCALAR:%.*]] = phi i64 [ 0, [[TMP2]] ], [ [[DOTSCALAR1:%.*]], [[TMP3]] ] -; CHECK-NEXT: [[DOTSCALAR2:%.*]] = phi i64 [ 10, [[TMP2]] ], [ [[DOTSCALAR3:%.*]], [[TMP3]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32*, i32** [[TMP1:%.*]], i64 [[DOTSCALAR]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32*, i32** [[TMP1]], i64 [[DOTSCALAR2]] -; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32*> @llvm.riscv.masked.strided.load.v2p0i32.p0p0i32.i64(<2 x i32*> undef, i32** [[TMP5]], i64 40, <2 x i1> ) -; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i32*> @llvm.riscv.masked.strided.load.v2p0i32.p0p0i32.i64(<2 x i32*> undef, i32** [[TMP6]], i64 40, <2 x i1> ) -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32*, i32** [[TMP0:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32** [[TMP9]] to <2 x i32*>* -; CHECK-NEXT: store <2 x i32*> [[TMP7]], <2 x i32*>* [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32*, i32** [[TMP9]], i64 2 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32** [[TMP11]] to <2 x i32*>* -; CHECK-NEXT: store <2 x i32*> [[TMP8]], <2 x i32*>* [[TMP12]], align 8 -; CHECK-NEXT: [[TMP13]] = add nuw i64 [[TMP4]], 4 -; CHECK-NEXT: [[DOTSCALAR1]] = add i64 [[DOTSCALAR]], 20 -; CHECK-NEXT: [[DOTSCALAR3]] = add i64 [[DOTSCALAR2]], 20 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[TMP15:%.*]], label [[TMP3]] -; CHECK: 15: -; CHECK-NEXT: ret void +; V-LABEL: @gather_of_pointers( +; V-NEXT: br label [[TMP3:%.*]] +; V: 3: +; V-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP13:%.*]], [[TMP3]] ] +; V-NEXT: [[DOTSCALAR:%.*]] = phi i64 [ 0, [[TMP2]] ], [ [[DOTSCALAR1:%.*]], [[TMP3]] ] +; V-NEXT: 
[[DOTSCALAR2:%.*]] = phi i64 [ 10, [[TMP2]] ], [ [[DOTSCALAR3:%.*]], [[TMP3]] ] +; V-NEXT: [[TMP5:%.*]] = getelementptr i32*, i32** [[TMP1:%.*]], i64 [[DOTSCALAR]] +; V-NEXT: [[TMP6:%.*]] = getelementptr i32*, i32** [[TMP1]], i64 [[DOTSCALAR2]] +; V-NEXT: [[TMP7:%.*]] = call <2 x i32*> @llvm.riscv.masked.strided.load.v2p0i32.p0p0i32.i64(<2 x i32*> undef, i32** [[TMP5]], i64 40, <2 x i1> ) +; V-NEXT: [[TMP8:%.*]] = call <2 x i32*> @llvm.riscv.masked.strided.load.v2p0i32.p0p0i32.i64(<2 x i32*> undef, i32** [[TMP6]], i64 40, <2 x i1> ) +; V-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32*, i32** [[TMP0:%.*]], i64 [[TMP4]] +; V-NEXT: [[TMP10:%.*]] = bitcast i32** [[TMP9]] to <2 x i32*>* +; V-NEXT: store <2 x i32*> [[TMP7]], <2 x i32*>* [[TMP10]], align 8 +; V-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32*, i32** [[TMP9]], i64 2 +; V-NEXT: [[TMP12:%.*]] = bitcast i32** [[TMP11]] to <2 x i32*>* +; V-NEXT: store <2 x i32*> [[TMP8]], <2 x i32*>* [[TMP12]], align 8 +; V-NEXT: [[TMP13]] = add nuw i64 [[TMP4]], 4 +; V-NEXT: [[DOTSCALAR1]] = add i64 [[DOTSCALAR]], 20 +; V-NEXT: [[DOTSCALAR3]] = add i64 [[DOTSCALAR2]], 20 +; V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 1024 +; V-NEXT: br i1 [[TMP14]], label [[TMP15:%.*]], label [[TMP3]] +; V: 15: +; V-NEXT: ret void +; +; ZVE32F-LABEL: @gather_of_pointers( +; ZVE32F-NEXT: br label [[TMP3:%.*]] +; ZVE32F: 3: +; ZVE32F-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP17:%.*]], [[TMP3]] ] +; ZVE32F-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ , [[TMP2]] ], [ [[TMP18:%.*]], [[TMP3]] ] +; ZVE32F-NEXT: [[TMP6:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], +; ZVE32F-NEXT: [[TMP7:%.*]] = mul <2 x i64> [[TMP5]], +; ZVE32F-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP7]], +; ZVE32F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32*, i32** [[TMP1:%.*]], <2 x i64> [[TMP6]] +; ZVE32F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32*, i32** [[TMP1]], <2 x i64> [[TMP8]] +; ZVE32F-NEXT: [[TMP11:%.*]] = call <2 x i32*> 
@llvm.masked.gather.v2p0i32.v2p0p0i32(<2 x i32**> [[TMP9]], i32 8, <2 x i1> , <2 x i32*> undef) +; ZVE32F-NEXT: [[TMP12:%.*]] = call <2 x i32*> @llvm.masked.gather.v2p0i32.v2p0p0i32(<2 x i32**> [[TMP10]], i32 8, <2 x i1> , <2 x i32*> undef) +; ZVE32F-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32*, i32** [[TMP0:%.*]], i64 [[TMP4]] +; ZVE32F-NEXT: [[TMP14:%.*]] = bitcast i32** [[TMP13]] to <2 x i32*>* +; ZVE32F-NEXT: store <2 x i32*> [[TMP11]], <2 x i32*>* [[TMP14]], align 8 +; ZVE32F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32*, i32** [[TMP13]], i64 2 +; ZVE32F-NEXT: [[TMP16:%.*]] = bitcast i32** [[TMP15]] to <2 x i32*>* +; ZVE32F-NEXT: store <2 x i32*> [[TMP12]], <2 x i32*>* [[TMP16]], align 8 +; ZVE32F-NEXT: [[TMP17]] = add nuw i64 [[TMP4]], 4 +; ZVE32F-NEXT: [[TMP18]] = add <2 x i64> [[TMP5]], +; ZVE32F-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], 1024 +; ZVE32F-NEXT: br i1 [[TMP19]], label [[TMP20:%.*]], label [[TMP3]] +; ZVE32F: 20: +; ZVE32F-NEXT: ret void ; br label %3 @@ -631,29 +657,54 @@ declare <2 x i32*> @llvm.masked.gather.v2p0i32.v2p0p0i32(<2 x i32**>, i32 immarg ; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers. 
define void @scatter_of_pointers(i32** noalias nocapture %0, i32** noalias nocapture readonly %1) { ; -; CHECK-LABEL: @scatter_of_pointers( -; CHECK-NEXT: br label [[TMP3:%.*]] -; CHECK: 3: -; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP13:%.*]], [[TMP3]] ] -; CHECK-NEXT: [[DOTSCALAR:%.*]] = phi i64 [ 0, [[TMP2]] ], [ [[DOTSCALAR1:%.*]], [[TMP3]] ] -; CHECK-NEXT: [[DOTSCALAR2:%.*]] = phi i64 [ 10, [[TMP2]] ], [ [[DOTSCALAR3:%.*]], [[TMP3]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32*, i32** [[TMP1:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32** [[TMP5]] to <2 x i32*>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32*, i32** [[TMP5]], i64 2 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32** [[TMP8]] to <2 x i32*>* -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP9]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32*, i32** [[TMP0:%.*]], i64 [[DOTSCALAR]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32*, i32** [[TMP0]], i64 [[DOTSCALAR2]] -; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v2p0i32.p0p0i32.i64(<2 x i32*> [[TMP7]], i32** [[TMP11]], i64 40, <2 x i1> ) -; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v2p0i32.p0p0i32.i64(<2 x i32*> [[TMP10]], i32** [[TMP12]], i64 40, <2 x i1> ) -; CHECK-NEXT: [[TMP13]] = add nuw i64 [[TMP4]], 4 -; CHECK-NEXT: [[DOTSCALAR1]] = add i64 [[DOTSCALAR]], 20 -; CHECK-NEXT: [[DOTSCALAR3]] = add i64 [[DOTSCALAR2]], 20 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[TMP15:%.*]], label [[TMP3]] -; CHECK: 15: -; CHECK-NEXT: ret void +; V-LABEL: @scatter_of_pointers( +; V-NEXT: br label [[TMP3:%.*]] +; V: 3: +; V-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP13:%.*]], [[TMP3]] ] +; V-NEXT: [[DOTSCALAR:%.*]] = phi i64 [ 0, [[TMP2]] ], [ [[DOTSCALAR1:%.*]], [[TMP3]] ] +; V-NEXT: [[DOTSCALAR2:%.*]] = phi i64 [ 
10, [[TMP2]] ], [ [[DOTSCALAR3:%.*]], [[TMP3]] ] +; V-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32*, i32** [[TMP1:%.*]], i64 [[TMP4]] +; V-NEXT: [[TMP6:%.*]] = bitcast i32** [[TMP5]] to <2 x i32*>* +; V-NEXT: [[TMP7:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP6]], align 8 +; V-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32*, i32** [[TMP5]], i64 2 +; V-NEXT: [[TMP9:%.*]] = bitcast i32** [[TMP8]] to <2 x i32*>* +; V-NEXT: [[TMP10:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP9]], align 8 +; V-NEXT: [[TMP11:%.*]] = getelementptr i32*, i32** [[TMP0:%.*]], i64 [[DOTSCALAR]] +; V-NEXT: [[TMP12:%.*]] = getelementptr i32*, i32** [[TMP0]], i64 [[DOTSCALAR2]] +; V-NEXT: call void @llvm.riscv.masked.strided.store.v2p0i32.p0p0i32.i64(<2 x i32*> [[TMP7]], i32** [[TMP11]], i64 40, <2 x i1> ) +; V-NEXT: call void @llvm.riscv.masked.strided.store.v2p0i32.p0p0i32.i64(<2 x i32*> [[TMP10]], i32** [[TMP12]], i64 40, <2 x i1> ) +; V-NEXT: [[TMP13]] = add nuw i64 [[TMP4]], 4 +; V-NEXT: [[DOTSCALAR1]] = add i64 [[DOTSCALAR]], 20 +; V-NEXT: [[DOTSCALAR3]] = add i64 [[DOTSCALAR2]], 20 +; V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 1024 +; V-NEXT: br i1 [[TMP14]], label [[TMP15:%.*]], label [[TMP3]] +; V: 15: +; V-NEXT: ret void +; +; ZVE32F-LABEL: @scatter_of_pointers( +; ZVE32F-NEXT: br label [[TMP3:%.*]] +; ZVE32F: 3: +; ZVE32F-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP17:%.*]], [[TMP3]] ] +; ZVE32F-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ , [[TMP2]] ], [ [[TMP18:%.*]], [[TMP3]] ] +; ZVE32F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32*, i32** [[TMP1:%.*]], i64 [[TMP4]] +; ZVE32F-NEXT: [[TMP7:%.*]] = bitcast i32** [[TMP6]] to <2 x i32*>* +; ZVE32F-NEXT: [[TMP8:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP7]], align 8 +; ZVE32F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32*, i32** [[TMP6]], i64 2 +; ZVE32F-NEXT: [[TMP10:%.*]] = bitcast i32** [[TMP9]] to <2 x i32*>* +; ZVE32F-NEXT: [[TMP11:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP10]], align 8 +; ZVE32F-NEXT: [[TMP12:%.*]] = 
mul nuw nsw <2 x i64> [[TMP5]], +; ZVE32F-NEXT: [[TMP13:%.*]] = mul <2 x i64> [[TMP5]], +; ZVE32F-NEXT: [[TMP14:%.*]] = add <2 x i64> [[TMP13]], +; ZVE32F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32*, i32** [[TMP0:%.*]], <2 x i64> [[TMP12]] +; ZVE32F-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32*, i32** [[TMP0]], <2 x i64> [[TMP14]] +; ZVE32F-NEXT: call void @llvm.masked.scatter.v2p0i32.v2p0p0i32(<2 x i32*> [[TMP8]], <2 x i32**> [[TMP15]], i32 8, <2 x i1> ) +; ZVE32F-NEXT: call void @llvm.masked.scatter.v2p0i32.v2p0p0i32(<2 x i32*> [[TMP11]], <2 x i32**> [[TMP16]], i32 8, <2 x i1> ) +; ZVE32F-NEXT: [[TMP17]] = add nuw i64 [[TMP4]], 4 +; ZVE32F-NEXT: [[TMP18]] = add <2 x i64> [[TMP5]], +; ZVE32F-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], 1024 +; ZVE32F-NEXT: br i1 [[TMP19]], label [[TMP20:%.*]], label [[TMP3]] +; ZVE32F: 20: +; ZVE32F-NEXT: ret void ; br label %3 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 35cb5f7a1648..65ea01fd9ebb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -1,25 +1,51 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+experimental-zvfh,+v -target-abi=ilp32d \ -; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32,RV32V ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+experimental-zvfh,+v -target-abi=lp64d \ -; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64V +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+experimental-zvfh,+zve32f -target-abi=ilp32d \ +; RUN: 
-riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32,RV32ZVE32F +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+experimental-zvfh,+zve32f -target-abi=lp64d \ +; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64ZVE32F declare <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*>, i32, <1 x i1>, <1 x i8>) define <1 x i8> @mgather_v1i8(<1 x i8*> %ptrs, <1 x i1> %m, <1 x i8> %passthru) { -; RV32-LABEL: mgather_v1i8: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v1i8: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv1r.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v1i8: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v1i8: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v1i8: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vmv1r.v v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v1i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB0_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB0_2: # %else +; RV64ZVE32F-NEXT: ret %v = 
call <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*> %ptrs, i32 1, <1 x i1> %m, <1 x i8> %passthru) ret <1 x i8> %v } @@ -27,144 +53,486 @@ define <1 x i8> @mgather_v1i8(<1 x i8*> %ptrs, <1 x i1> %m, <1 x i8> %passthru) declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>) define <2 x i8> @mgather_v2i8(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) { -; RV32-LABEL: mgather_v2i8: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i8: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv1r.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i8: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i8: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i8: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vmv1r.v v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v 
v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB1_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB1_4 +; RV64ZVE32F-NEXT: .LBB1_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB1_3: # %cond.load +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB1_2 +; RV64ZVE32F-NEXT: .LBB1_4: # %cond.load1 +; RV64ZVE32F-NEXT: lb a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) ret <2 x i8> %v } define <2 x i16> @mgather_v2i8_sextload_v2i16(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) { -; RV32-LABEL: mgather_v2i8_sextload_v2i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; RV32-NEXT: vsext.vf2 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i8_sextload_v2i16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32V-NEXT: vsext.vf2 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i8_sextload_v2i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; RV64-NEXT: vsext.vf2 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i8_sextload_v2i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; 
RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV64V-NEXT: vsext.vf2 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i8_sextload_v2i16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf2 v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB2_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB2_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB2_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lb a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB2_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vsext.vf2 v9, v8 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i8> 
@llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = sext <2 x i8> %v to <2 x i16> ret <2 x i16> %ev } define <2 x i16> @mgather_v2i8_zextload_v2i16(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) { -; RV32-LABEL: mgather_v2i8_zextload_v2i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; RV32-NEXT: vzext.vf2 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i8_zextload_v2i16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32V-NEXT: vzext.vf2 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i8_zextload_v2i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; RV64-NEXT: vzext.vf2 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i8_zextload_v2i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV64V-NEXT: vzext.vf2 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i8_zextload_v2i16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vzext.vf2 v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 
2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB3_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB3_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB3_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lb a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB3_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vzext.vf2 v9, v8 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = zext <2 x i8> %v to <2 x i16> ret <2 x i16> %ev } define <2 x i32> @mgather_v2i8_sextload_v2i32(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) { -; RV32-LABEL: mgather_v2i8_sextload_v2i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV32-NEXT: vsext.vf4 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i8_sextload_v2i32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32V-NEXT: vsext.vf4 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i8_sextload_v2i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; 
RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV64-NEXT: vsext.vf4 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i8_sextload_v2i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64V-NEXT: vsext.vf4 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i8_sextload_v2i32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsext.vf4 v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB4_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB4_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB4_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lb a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; 
RV64ZVE32F-NEXT: .LBB4_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsext.vf4 v9, v8 +; RV64ZVE32F-NEXT: vmv.v.v v8, v9 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = sext <2 x i8> %v to <2 x i32> ret <2 x i32> %ev } define <2 x i32> @mgather_v2i8_zextload_v2i32(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) { -; RV32-LABEL: mgather_v2i8_zextload_v2i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV32-NEXT: vzext.vf4 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i8_zextload_v2i32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32V-NEXT: vzext.vf4 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i8_zextload_v2i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV64-NEXT: vzext.vf4 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i8_zextload_v2i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64V-NEXT: vzext.vf4 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i8_zextload_v2i32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vzext.vf4 v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: 
vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB5_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB5_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB5_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lb a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB5_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vzext.vf4 v9, v8 +; RV64ZVE32F-NEXT: vmv.v.v v8, v9 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = zext <2 x i8> %v to <2 x i32> ret <2 x i32> %ev } define <2 x i64> @mgather_v2i8_sextload_v2i64(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) { -; RV32-LABEL: mgather_v2i8_sextload_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vsext.vf8 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i8_sextload_v2i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), 
v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32V-NEXT: vsext.vf8 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i8_sextload_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vsext.vf8 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i8_sextload_v2i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64V-NEXT: vsext.vf8 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i8_sextload_v2i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: srai a2, a1, 31 +; RV32ZVE32F-NEXT: vmv.x.s a3, v9 +; RV32ZVE32F-NEXT: srai a4, a3, 31 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 12(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB6_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load 
+; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB6_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB6_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lb a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB6_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = sext <2 x i8> %v to <2 x i64> ret <2 x i64> %ev } define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x i8*> %ptrs, <2 x i1> %m, <2 x i8> %passthru) { -; RV32-LABEL: mgather_v2i8_zextload_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vzext.vf8 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i8_zextload_v2i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32V-NEXT: vzext.vf8 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i8_zextload_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vzext.vf8 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i8_zextload_v2i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e64, m1, 
ta, mu +; RV64V-NEXT: vzext.vf8 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i8_zextload_v2i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: andi a1, a1, 255 +; RV32ZVE32F-NEXT: vmv.x.s a2, v9 +; RV32ZVE32F-NEXT: andi a2, a2, 255 +; RV32ZVE32F-NEXT: sw zero, 12(a0) +; RV32ZVE32F-NEXT: sw zero, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 8(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB7_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB7_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB7_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lb a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB7_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli 
zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: andi a1, a0, 255 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) %ev = zext <2 x i8> %v to <2 x i64> ret <2 x i64> %ev @@ -180,12 +548,77 @@ define <4 x i8> @mgather_v4i8(<4 x i8*> %ptrs, <4 x i1> %m, <4 x i8> %passthru) ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_v4i8: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v4i8: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v4i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a1) +; RV64ZVE32F-NEXT: lbu a1, 15(sp) +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: bnez a2, .LBB8_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB8_6 +; RV64ZVE32F-NEXT: .LBB8_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB8_7 +; RV64ZVE32F-NEXT: .LBB8_3: # %else5 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; 
RV64ZVE32F-NEXT: bnez a1, .LBB8_8 +; RV64ZVE32F-NEXT: .LBB8_4: # %else8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB8_5: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB8_2 +; RV64ZVE32F-NEXT: .LBB8_6: # %cond.load1 +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB8_3 +; RV64ZVE32F-NEXT: .LBB8_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB8_4 +; RV64ZVE32F-NEXT: .LBB8_8: # %cond.load7 +; RV64ZVE32F-NEXT: ld a0, 24(a0) +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> %m, <4 x i8> %passthru) ret <4 x i8> %v } @@ -198,12 +631,77 @@ define <4 x i8> @mgather_truemask_v4i8(<4 x i8*> %ptrs, <4 x i8> %passthru) { ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_truemask_v4i8: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8 -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_truemask_v4i8: +; 
RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8 +; RV64V-NEXT: vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_truemask_v4i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB9_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB9_6 +; RV64ZVE32F-NEXT: .LBB9_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB9_7 +; RV64ZVE32F-NEXT: .LBB9_3: # %else5 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB9_8 +; RV64ZVE32F-NEXT: .LBB9_4: # %else8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB9_5: # %cond.load +; RV64ZVE32F-NEXT: lb a4, 0(a4) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a4 +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB9_2 +; RV64ZVE32F-NEXT: .LBB9_6: # %cond.load1 +; RV64ZVE32F-NEXT: lb a3, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; 
RV64ZVE32F-NEXT: beqz a3, .LBB9_3 +; RV64ZVE32F-NEXT: .LBB9_7: # %cond.load4 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB9_4 +; RV64ZVE32F-NEXT: .LBB9_8: # %cond.load7 +; RV64ZVE32F-NEXT: lb a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> %mtrue, <4 x i8> %passthru) @@ -216,10 +714,14 @@ define <4 x i8> @mgather_falsemask_v4i8(<4 x i8*> %ptrs, <4 x i8> %passthru) { ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_falsemask_v4i8: -; RV64: # %bb.0: -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_falsemask_v4i8: +; RV64V: # %bb.0: +; RV64V-NEXT: vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_falsemask_v4i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ret %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> zeroinitializer, <4 x i8> %passthru) ret <4 x i8> %v } @@ -234,12 +736,111 @@ define <8 x i8> @mgather_v8i8(<8 x i8*> %ptrs, <8 x i1> %m, <8 x i8> %passthru) ; RV32-NEXT: vmv1r.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_v8i8: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: vluxei64.v v12, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v8i8: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64V-NEXT: vluxei64.v 
v12, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v12 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v8i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: bnez a2, .LBB11_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB11_10 +; RV64ZVE32F-NEXT: .LBB11_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB11_11 +; RV64ZVE32F-NEXT: .LBB11_3: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB11_12 +; RV64ZVE32F-NEXT: .LBB11_4: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB11_13 +; RV64ZVE32F-NEXT: .LBB11_5: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB11_14 +; RV64ZVE32F-NEXT: .LBB11_6: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: bnez a2, .LBB11_15 +; RV64ZVE32F-NEXT: .LBB11_7: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB11_16 +; RV64ZVE32F-NEXT: .LBB11_8: # %else20 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB11_9: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB11_2 +; RV64ZVE32F-NEXT: .LBB11_10: # %cond.load1 +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB11_3 +; RV64ZVE32F-NEXT: .LBB11_11: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB11_4 +; RV64ZVE32F-NEXT: .LBB11_12: # %cond.load7 +; RV64ZVE32F-NEXT: ld a2, 24(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB11_5 +; RV64ZVE32F-NEXT: .LBB11_13: # %cond.load10 +; RV64ZVE32F-NEXT: ld a2, 32(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB11_6 +; RV64ZVE32F-NEXT: .LBB11_14: # %cond.load13 +; RV64ZVE32F-NEXT: ld a2, 40(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: beqz a2, .LBB11_7 +; RV64ZVE32F-NEXT: .LBB11_15: # %cond.load16 +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB11_8 +; RV64ZVE32F-NEXT: .LBB11_16: # %cond.load19 +; RV64ZVE32F-NEXT: ld a0, 56(a0) +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 +; RV64ZVE32F-NEXT: ret %v = call <8 x 
i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> %m, <8 x i8> %passthru) ret <8 x i8> %v } @@ -254,14 +855,125 @@ define <8 x i8> @mgather_baseidx_v8i8(i8* %base, <8 x i8> %idxs, <8 x i1> %m, <8 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i8: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v12, v8 -; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; RV64-NEXT: vluxei64.v v9, (a0), v12, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i8: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: .LBB12_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: .LBB12_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_6 +; 
RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB12_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB12_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB12_14 +; RV64ZVE32F-NEXT: .LBB12_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_10 +; RV64ZVE32F-NEXT: .LBB12_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB12_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB12_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB12_16 +; RV64ZVE32F-NEXT: .LBB12_12: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB12_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_8 +; 
RV64ZVE32F-NEXT: .LBB12_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB12_9 +; RV64ZVE32F-NEXT: j .LBB12_10 +; RV64ZVE32F-NEXT: .LBB12_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB12_12 +; RV64ZVE32F-NEXT: .LBB12_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %idxs %v = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> %m, <8 x i8> %passthru) ret <8 x i8> %v @@ -270,19 +982,41 @@ define <8 x i8> @mgather_baseidx_v8i8(i8* %base, <8 x i8> %idxs, <8 x i1> %m, <8 declare <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*>, i32, <1 x i1>, <1 x i16>) define <1 x i16> @mgather_v1i16(<1 x i16*> %ptrs, <1 x i1> %m, <1 x i16> %passthru) { -; RV32-LABEL: mgather_v1i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v1i16: +; RV32V: # %bb.0: +; 
RV32V-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv1r.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v1i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v1i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v1i16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vmv1r.v v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v1i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB13_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB13_2: # %else +; RV64ZVE32F-NEXT: ret %v = call <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*> %ptrs, i32 2, <1 x i1> %m, <1 x i16> %passthru) ret <1 x i16> %v } @@ -290,102 +1024,357 @@ define <1 x i16> @mgather_v1i16(<1 x i16*> %ptrs, <1 x i1> %m, <1 x i16> %passth declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>) define <2 x i16> @mgather_v2i16(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) { -; RV32-LABEL: mgather_v2i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; 
RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv1r.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vmv1r.v v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB14_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB14_4 +; RV64ZVE32F-NEXT: .LBB14_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB14_3: # %cond.load +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB14_2 +; RV64ZVE32F-NEXT: .LBB14_4: # %cond.load1 +; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli 
zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) ret <2 x i16> %v } define <2 x i32> @mgather_v2i16_sextload_v2i32(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) { -; RV32-LABEL: mgather_v2i16_sextload_v2i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV32-NEXT: vsext.vf2 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i16_sextload_v2i32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32V-NEXT: vsext.vf2 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i16_sextload_v2i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV64-NEXT: vsext.vf2 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i16_sextload_v2i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64V-NEXT: vsext.vf2 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i16_sextload_v2i32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsext.vf2 v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, 
e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB15_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB15_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB15_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB15_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vsext.vf2 v9, v8 +; RV64ZVE32F-NEXT: vmv.v.v v8, v9 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) %ev = sext <2 x i16> %v to <2 x i32> ret <2 x i32> %ev } define <2 x i32> @mgather_v2i16_zextload_v2i32(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) { -; RV32-LABEL: mgather_v2i16_zextload_v2i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV32-NEXT: vzext.vf2 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i16_zextload_v2i32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32V-NEXT: vluxei32.v v9, 
(zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32V-NEXT: vzext.vf2 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i16_zextload_v2i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; RV64-NEXT: vzext.vf2 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i16_zextload_v2i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64V-NEXT: vzext.vf2 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i16_zextload_v2i32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vzext.vf2 v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i16_zextload_v2i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB16_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB16_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB16_4 +; RV64ZVE32F-NEXT: # 
%bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB16_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vzext.vf2 v9, v8 +; RV64ZVE32F-NEXT: vmv.v.v v8, v9 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) %ev = zext <2 x i16> %v to <2 x i32> ret <2 x i32> %ev } define <2 x i64> @mgather_v2i16_sextload_v2i64(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) { -; RV32-LABEL: mgather_v2i16_sextload_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vsext.vf4 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i16_sextload_v2i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32V-NEXT: vsext.vf4 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i16_sextload_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vsext.vf4 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i16_sextload_v2i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64V-NEXT: vsext.vf4 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i16_sextload_v2i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetivli 
zero, 1, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: srai a2, a1, 31 +; RV32ZVE32F-NEXT: vmv.x.s a3, v9 +; RV32ZVE32F-NEXT: srai a4, a3, 31 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 12(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB17_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB17_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB17_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB17_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, 
i32 2, <2 x i1> %m, <2 x i16> %passthru) %ev = sext <2 x i16> %v to <2 x i64> ret <2 x i64> %ev } define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x i16*> %ptrs, <2 x i1> %m, <2 x i16> %passthru) { -; RV32-LABEL: mgather_v2i16_zextload_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vzext.vf4 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i16_zextload_v2i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32V-NEXT: vzext.vf4 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i16_zextload_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vzext.vf4 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i16_zextload_v2i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64V-NEXT: vzext.vf4 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i16_zextload_v2i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lui a2, 16 +; RV32ZVE32F-NEXT: addi a2, a2, -1 +; RV32ZVE32F-NEXT: and a1, a1, a2 +; RV32ZVE32F-NEXT: vmv.x.s a3, v9 +; RV32ZVE32F-NEXT: and a2, a3, a2 +; RV32ZVE32F-NEXT: sw zero, 12(a0) +; RV32ZVE32F-NEXT: sw zero, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 8(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i16_zextload_v2i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi 
sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB18_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB18_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB18_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB18_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: lui a1, 16 +; RV64ZVE32F-NEXT: addiw a1, a1, -1 +; RV64ZVE32F-NEXT: and a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: and a1, a2, a1 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) %ev = zext <2 x i16> %v to <2 x i64> ret <2 x i64> %ev @@ -401,12 +1390,77 @@ define <4 x i16> @mgather_v4i16(<4 x i16*> %ptrs, <4 x i1> %m, <4 x i16> %passth ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_v4i16: 
-; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v4i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v4i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a1) +; RV64ZVE32F-NEXT: lbu a1, 15(sp) +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_6 +; RV64ZVE32F-NEXT: .LBB19_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_7 +; RV64ZVE32F-NEXT: .LBB19_3: # %else5 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB19_8 +; RV64ZVE32F-NEXT: .LBB19_4: # %else8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB19_5: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_2 +; RV64ZVE32F-NEXT: .LBB19_6: # %cond.load1 +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_3 +; RV64ZVE32F-NEXT: .LBB19_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB19_4 +; RV64ZVE32F-NEXT: .LBB19_8: # %cond.load7 +; RV64ZVE32F-NEXT: ld a0, 24(a0) +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> %m, <4 x i16> %passthru) ret <4 x i16> %v } @@ -419,12 +1473,77 @@ define <4 x i16> @mgather_truemask_v4i16(<4 x i16*> %ptrs, <4 x i16> %passthru) ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_truemask_v4i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8 -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_truemask_v4i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8 +; RV64V-NEXT: vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_truemask_v4i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; 
RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB20_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB20_6 +; RV64ZVE32F-NEXT: .LBB20_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB20_7 +; RV64ZVE32F-NEXT: .LBB20_3: # %else5 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB20_8 +; RV64ZVE32F-NEXT: .LBB20_4: # %else8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB20_5: # %cond.load +; RV64ZVE32F-NEXT: lh a4, 0(a4) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a4 +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB20_2 +; RV64ZVE32F-NEXT: .LBB20_6: # %cond.load1 +; RV64ZVE32F-NEXT: lh a3, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB20_3 +; RV64ZVE32F-NEXT: .LBB20_7: # %cond.load4 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB20_4 +; RV64ZVE32F-NEXT: .LBB20_8: # %cond.load7 +; RV64ZVE32F-NEXT: lh a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: 
vsetvli zero, zero, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> %mtrue, <4 x i16> %passthru) @@ -437,10 +1556,14 @@ define <4 x i16> @mgather_falsemask_v4i16(<4 x i16*> %ptrs, <4 x i16> %passthru) ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_falsemask_v4i16: -; RV64: # %bb.0: -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_falsemask_v4i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_falsemask_v4i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ret %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> zeroinitializer, <4 x i16> %passthru) ret <4 x i16> %v } @@ -455,12 +1578,111 @@ define <8 x i16> @mgather_v8i16(<8 x i16*> %ptrs, <8 x i1> %m, <8 x i16> %passth ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_v8i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64-NEXT: vluxei64.v v12, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v8i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64V-NEXT: vluxei64.v v12, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v8i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: bnez a2, .LBB22_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB22_10 +; RV64ZVE32F-NEXT: .LBB22_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB22_11 +; 
RV64ZVE32F-NEXT: .LBB22_3: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB22_12 +; RV64ZVE32F-NEXT: .LBB22_4: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB22_13 +; RV64ZVE32F-NEXT: .LBB22_5: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB22_14 +; RV64ZVE32F-NEXT: .LBB22_6: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: bnez a2, .LBB22_15 +; RV64ZVE32F-NEXT: .LBB22_7: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB22_16 +; RV64ZVE32F-NEXT: .LBB22_8: # %else20 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB22_9: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB22_2 +; RV64ZVE32F-NEXT: .LBB22_10: # %cond.load1 +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB22_3 +; RV64ZVE32F-NEXT: .LBB22_11: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB22_4 +; RV64ZVE32F-NEXT: .LBB22_12: # %cond.load7 +; RV64ZVE32F-NEXT: ld a2, 24(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: 
beqz a2, .LBB22_5 +; RV64ZVE32F-NEXT: .LBB22_13: # %cond.load10 +; RV64ZVE32F-NEXT: ld a2, 32(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB22_6 +; RV64ZVE32F-NEXT: .LBB22_14: # %cond.load13 +; RV64ZVE32F-NEXT: ld a2, 40(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: beqz a2, .LBB22_7 +; RV64ZVE32F-NEXT: .LBB22_15: # %cond.load16 +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB22_8 +; RV64ZVE32F-NEXT: .LBB22_16: # %cond.load19 +; RV64ZVE32F-NEXT: ld a0, 56(a0) +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 +; RV64ZVE32F-NEXT: ret %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru) ret <8 x i16> %v } @@ -476,15 +1698,140 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(i16* %base, <8 x i8> %idxs, <8 x i1 ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i8_v8i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v12, v8 -; RV64-NEXT: vadd.vv v12, v12, v12 -; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, 
(a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i8_v8i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vadd.vv v12, v12, v12 +; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i8_v8i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: .LBB23_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: .LBB23_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, 
e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB23_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB23_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB23_14 +; RV64ZVE32F-NEXT: .LBB23_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_10 +; RV64ZVE32F-NEXT: .LBB23_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB23_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB23_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB23_16 +; RV64ZVE32F-NEXT: .LBB23_12: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB23_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_8 +; RV64ZVE32F-NEXT: .LBB23_14: # %cond.load10 +; 
RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB23_9 +; RV64ZVE32F-NEXT: j .LBB23_10 +; RV64ZVE32F-NEXT: .LBB23_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB23_12 +; RV64ZVE32F-NEXT: .LBB23_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, i16* %base, <8 x i8> %idxs %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru) ret <8 x i16> %v @@ -501,15 +1848,140 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(i16* %base, <8 x i8> %idxs, <8 ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i8_v8i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v12, v8 -; RV64-NEXT: vadd.vv v12, v12, v12 -; RV64-NEXT: 
vsetvli zero, zero, e16, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i8_v8i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vadd.vv v12, v12, v12 +; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: .LBB24_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: .LBB24_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu 
+; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB24_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB24_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB24_14 +; RV64ZVE32F-NEXT: .LBB24_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_10 +; RV64ZVE32F-NEXT: .LBB24_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB24_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB24_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB24_16 +; RV64ZVE32F-NEXT: .LBB24_12: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB24_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: 
beqz a2, .LBB24_8 +; RV64ZVE32F-NEXT: .LBB24_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB24_9 +; RV64ZVE32F-NEXT: j .LBB24_10 +; RV64ZVE32F-NEXT: .LBB24_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB24_12 +; RV64ZVE32F-NEXT: .LBB24_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %eidxs %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru) @@ -527,15 +1999,148 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(i16* %base, <8 x i8> %idxs, <8 ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_zext_v8i8_v8i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, 
m4, ta, mu -; RV64-NEXT: vzext.vf8 v12, v8 -; RV64-NEXT: vadd.vv v12, v12, v12 -; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf8 v12, v8 +; RV64V-NEXT: vadd.vv v12, v12, v12 +; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i8_v8i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: .LBB25_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: .LBB25_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: 
vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB25_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB25_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB25_14 +; RV64ZVE32F-NEXT: .LBB25_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_10 +; RV64ZVE32F-NEXT: .LBB25_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB25_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB25_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB25_16 +; RV64ZVE32F-NEXT: .LBB25_12: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB25_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add 
a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_8 +; RV64ZVE32F-NEXT: .LBB25_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB25_9 +; RV64ZVE32F-NEXT: j .LBB25_10 +; RV64ZVE32F-NEXT: .LBB25_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB25_12 +; RV64ZVE32F-NEXT: .LBB25_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: andi a1, a1, 255 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds 
i16, i16* %base, <8 x i16> %eidxs %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru) @@ -553,15 +2158,135 @@ define <8 x i16> @mgather_baseidx_v8i16(i16* %base, <8 x i16> %idxs, <8 x i1> %m ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf4 v12, v8 -; RV64-NEXT: vadd.vv v12, v12, v12 -; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf4 v12, v8 +; RV64V-NEXT: vadd.vv v12, v12, v12 +; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: .LBB26_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; RV64ZVE32F-NEXT: 
vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: .LBB26_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB26_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB26_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB26_14 +; RV64ZVE32F-NEXT: .LBB26_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_10 +; RV64ZVE32F-NEXT: .LBB26_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB26_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB26_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB26_16 +; RV64ZVE32F-NEXT: .LBB26_12: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB26_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; 
RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_8 +; RV64ZVE32F-NEXT: .LBB26_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB26_9 +; RV64ZVE32F-NEXT: j .LBB26_10 +; RV64ZVE32F-NEXT: .LBB26_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB26_12 +; RV64ZVE32F-NEXT: .LBB26_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %idxs %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru) ret <8 x i16> %v @@ -570,19 +2295,41 @@ define <8 x i16> 
@mgather_baseidx_v8i16(i16* %base, <8 x i16> %idxs, <8 x i1> %m declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>) define <1 x i32> @mgather_v1i32(<1 x i32*> %ptrs, <1 x i1> %m, <1 x i32> %passthru) { -; RV32-LABEL: mgather_v1i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v1i32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv1r.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v1i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v1i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v1i32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vmv.v.v v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v1i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB27_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB27_2: # %else +; RV64ZVE32F-NEXT: ret %v = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptrs, i32 4, <1 x i1> %m, <1 x i32> %passthru) ret <1 x i32> %v } @@ -590,60 +2337,218 @@ define <1 x i32> @mgather_v1i32(<1 x i32*> %ptrs, <1 x i1> %m, <1 x 
i32> %passth declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) define <2 x i32> @mgather_v2i32(<2 x i32*> %ptrs, <2 x i1> %m, <2 x i32> %passthru) { -; RV32-LABEL: mgather_v2i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv1r.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vmv.v.v v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB28_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 
+; RV64ZVE32F-NEXT: bnez a0, .LBB28_4 +; RV64ZVE32F-NEXT: .LBB28_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB28_3: # %cond.load +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB28_2 +; RV64ZVE32F-NEXT: .LBB28_4: # %cond.load1 +; RV64ZVE32F-NEXT: lw a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru) ret <2 x i32> %v } define <2 x i64> @mgather_v2i32_sextload_v2i64(<2 x i32*> %ptrs, <2 x i1> %m, <2 x i32> %passthru) { -; RV32-LABEL: mgather_v2i32_sextload_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vsext.vf2 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i32_sextload_v2i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32V-NEXT: vsext.vf2 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i32_sextload_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vsext.vf2 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i32_sextload_v2i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64V-NEXT: vsext.vf2 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: 
mgather_v2i32_sextload_v2i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: srai a1, a1, 31 +; RV32ZVE32F-NEXT: vmv.x.s a2, v9 +; RV32ZVE32F-NEXT: srai a2, a2, 31 +; RV32ZVE32F-NEXT: vse32.v v9, (a0) +; RV32ZVE32F-NEXT: addi a3, a0, 8 +; RV32ZVE32F-NEXT: vse32.v v8, (a3) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a1, 12(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2i32_sextload_v2i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB29_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB29_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB29_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lw a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB29_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru) %ev = sext <2 x i32> %v to <2 x i64> ret <2 x i64> %ev } define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x i32*> %ptrs, <2 x i1> %m, <2 x i32> %passthru) { -; RV32-LABEL: mgather_v2i32_zextload_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vzext.vf2 v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i32_zextload_v2i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32V-NEXT: vzext.vf2 v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i32_zextload_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vzext.vf2 v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i32_zextload_v2i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64V-NEXT: vzext.vf2 v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i32_zextload_v2i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV32ZVE32F-NEXT: sw zero, 12(a0) +; RV32ZVE32F-NEXT: sw zero, 4(a0) +; RV32ZVE32F-NEXT: vse32.v v9, (a0) +; RV32ZVE32F-NEXT: addi a0, a0, 8 +; RV32ZVE32F-NEXT: vse32.v v8, (a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: 
mgather_v2i32_zextload_v2i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB30_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: .LBB30_2: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB30_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: lw a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: .LBB30_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 32 +; RV64ZVE32F-NEXT: srli a1, a0, 32 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 32 +; RV64ZVE32F-NEXT: srli a0, a0, 32 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru) %ev = zext <2 x i32> %v to <2 x i64> ret <2 x i64> %ev @@ -659,12 +2564,77 @@ define <4 x i32> @mgather_v4i32(<4 x i32*> %ptrs, <4 x i1> %m, <4 x i32> %passth ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; 
RV64-LABEL: mgather_v4i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v4i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v4i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a1) +; RV64ZVE32F-NEXT: lbu a1, 15(sp) +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_6 +; RV64ZVE32F-NEXT: .LBB31_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_7 +; RV64ZVE32F-NEXT: .LBB31_3: # %else5 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB31_8 +; RV64ZVE32F-NEXT: .LBB31_4: # %else8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB31_5: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_2 +; RV64ZVE32F-NEXT: .LBB31_6: # %cond.load1 +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; 
RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_3 +; RV64ZVE32F-NEXT: .LBB31_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB31_4 +; RV64ZVE32F-NEXT: .LBB31_8: # %cond.load7 +; RV64ZVE32F-NEXT: ld a0, 24(a0) +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %m, <4 x i32> %passthru) ret <4 x i32> %v } @@ -676,12 +2646,77 @@ define <4 x i32> @mgather_truemask_v4i32(<4 x i32*> %ptrs, <4 x i32> %passthru) ; RV32-NEXT: vluxei32.v v8, (zero), v8 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_truemask_v4i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8 -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_truemask_v4i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8 +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_truemask_v4i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; 
RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB32_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB32_6 +; RV64ZVE32F-NEXT: .LBB32_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB32_7 +; RV64ZVE32F-NEXT: .LBB32_3: # %else5 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB32_8 +; RV64ZVE32F-NEXT: .LBB32_4: # %else8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB32_5: # %cond.load +; RV64ZVE32F-NEXT: lw a4, 0(a4) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a4 +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB32_2 +; RV64ZVE32F-NEXT: .LBB32_6: # %cond.load1 +; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB32_3 +; RV64ZVE32F-NEXT: .LBB32_7: # %cond.load4 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB32_4 +; RV64ZVE32F-NEXT: .LBB32_8: # %cond.load7 +; RV64ZVE32F-NEXT: lw a0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: 
vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mtrue, <4 x i32> %passthru) @@ -694,10 +2729,14 @@ define <4 x i32> @mgather_falsemask_v4i32(<4 x i32*> %ptrs, <4 x i32> %passthru) ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_falsemask_v4i32: -; RV64: # %bb.0: -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_falsemask_v4i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_falsemask_v4i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ret %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> zeroinitializer, <4 x i32> %passthru) ret <4 x i32> %v } @@ -712,12 +2751,111 @@ define <8 x i32> @mgather_v8i32(<8 x i32*> %ptrs, <8 x i1> %m, <8 x i32> %passth ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_v8i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v12, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v8i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v12, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_10 +; RV64ZVE32F-NEXT: .LBB34_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; 
RV64ZVE32F-NEXT: bnez a2, .LBB34_11 +; RV64ZVE32F-NEXT: .LBB34_3: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_12 +; RV64ZVE32F-NEXT: .LBB34_4: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_13 +; RV64ZVE32F-NEXT: .LBB34_5: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_14 +; RV64ZVE32F-NEXT: .LBB34_6: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_15 +; RV64ZVE32F-NEXT: .LBB34_7: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB34_16 +; RV64ZVE32F-NEXT: .LBB34_8: # %else20 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB34_9: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_2 +; RV64ZVE32F-NEXT: .LBB34_10: # %cond.load1 +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_3 +; RV64ZVE32F-NEXT: .LBB34_11: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_4 +; RV64ZVE32F-NEXT: .LBB34_12: # %cond.load7 +; RV64ZVE32F-NEXT: ld a2, 24(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 3 +; 
RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_5 +; RV64ZVE32F-NEXT: .LBB34_13: # %cond.load10 +; RV64ZVE32F-NEXT: ld a2, 32(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_6 +; RV64ZVE32F-NEXT: .LBB34_14: # %cond.load13 +; RV64ZVE32F-NEXT: ld a2, 40(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 5 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_7 +; RV64ZVE32F-NEXT: .LBB34_15: # %cond.load16 +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB34_8 +; RV64ZVE32F-NEXT: .LBB34_16: # %cond.load19 +; RV64ZVE32F-NEXT: ld a0, 56(a0) +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 7 +; RV64ZVE32F-NEXT: ret %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) ret <8 x i32> %v } @@ -732,15 +2870,140 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(i32* %base, <8 x i8> %idxs, <8 x i1 ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i8_v8i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: 
vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i8_v8i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i8_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: .LBB35_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB35_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; 
RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB35_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_14 +; RV64ZVE32F-NEXT: .LBB35_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_10 +; RV64ZVE32F-NEXT: .LBB35_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB35_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB35_16 +; RV64ZVE32F-NEXT: .LBB35_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB35_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: 
beqz a2, .LBB35_8 +; RV64ZVE32F-NEXT: .LBB35_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_9 +; RV64ZVE32F-NEXT: j .LBB35_10 +; RV64ZVE32F-NEXT: .LBB35_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB35_12 +; RV64ZVE32F-NEXT: .LBB35_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, i32* %base, <8 x i8> %idxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) ret <8 x i32> %v @@ -756,15 +3019,140 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(i32* %base, <8 x i8> %idxs, <8 ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i8_v8i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; 
RV64-NEXT: vsext.vf8 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i8_v8i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: .LBB36_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB36_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; 
RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB36_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB36_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB36_14 +; RV64ZVE32F-NEXT: .LBB36_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_10 +; RV64ZVE32F-NEXT: .LBB36_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB36_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB36_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB36_16 +; RV64ZVE32F-NEXT: .LBB36_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB36_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; 
RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_8 +; RV64ZVE32F-NEXT: .LBB36_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB36_9 +; RV64ZVE32F-NEXT: j .LBB36_10 +; RV64ZVE32F-NEXT: .LBB36_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB36_12 +; RV64ZVE32F-NEXT: .LBB36_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) @@ -781,15 +3169,148 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(i32* %base, <8 x i8> %idxs, <8 ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; 
-; RV64-LABEL: mgather_baseidx_zext_v8i8_v8i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf8 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf8 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i8_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: .LBB37_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB37_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; 
RV64ZVE32F-NEXT: beqz a2, .LBB37_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB37_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB37_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB37_14 +; RV64ZVE32F-NEXT: .LBB37_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_10 +; RV64ZVE32F-NEXT: .LBB37_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB37_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB37_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB37_16 +; RV64ZVE32F-NEXT: .LBB37_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB37_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; 
RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_8 +; RV64ZVE32F-NEXT: .LBB37_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB37_9 +; RV64ZVE32F-NEXT: j .LBB37_10 +; RV64ZVE32F-NEXT: .LBB37_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB37_12 +; RV64ZVE32F-NEXT: .LBB37_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: andi a1, a1, 255 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; 
RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) @@ -806,15 +3327,141 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(i32* %base, <8 x i16> %idxs, <8 x ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i16_v8i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf4 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i16_v8i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf4 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i16_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: .LBB38_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) 
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB38_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB38_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB38_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB38_14 +; RV64ZVE32F-NEXT: .LBB38_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_10 +; RV64ZVE32F-NEXT: .LBB38_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB38_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB38_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, 
.LBB38_16 +; RV64ZVE32F-NEXT: .LBB38_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB38_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_8 +; RV64ZVE32F-NEXT: .LBB38_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB38_9 +; RV64ZVE32F-NEXT: j .LBB38_10 +; RV64ZVE32F-NEXT: .LBB38_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB38_12 +; RV64ZVE32F-NEXT: .LBB38_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: 
vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, i32* %base, <8 x i16> %idxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) ret <8 x i32> %v @@ -830,15 +3477,141 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(i32* %base, <8 x i16> %idxs, ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i16_v8i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf4 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i16_v8i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf4 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: .LBB39_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, 
v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB39_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB39_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB39_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB39_14 +; RV64ZVE32F-NEXT: .LBB39_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_10 +; RV64ZVE32F-NEXT: .LBB39_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB39_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; 
RV64ZVE32F-NEXT: bnez a2, .LBB39_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB39_16 +; RV64ZVE32F-NEXT: .LBB39_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB39_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_8 +; RV64ZVE32F-NEXT: .LBB39_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB39_9 +; RV64ZVE32F-NEXT: j .LBB39_10 +; RV64ZVE32F-NEXT: .LBB39_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB39_12 +; RV64ZVE32F-NEXT: .LBB39_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; 
RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) @@ -855,15 +3628,151 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(i32* %base, <8 x i16> %idxs, ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_zext_v8i16_v8i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf4 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf4 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: lui a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: addiw a1, a1, -1 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; 
RV64ZVE32F-NEXT: vmv.s.x v10, a3 +; RV64ZVE32F-NEXT: .LBB40_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB40_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB40_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB40_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB40_14 +; RV64ZVE32F-NEXT: .LBB40_8: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_10 +; RV64ZVE32F-NEXT: .LBB40_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; 
RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB40_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB40_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: bnez a2, .LBB40_16 +; RV64ZVE32F-NEXT: .LBB40_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB40_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_8 +; RV64ZVE32F-NEXT: .LBB40_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB40_9 +; RV64ZVE32F-NEXT: j .LBB40_10 +; RV64ZVE32F-NEXT: .LBB40_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; 
RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_12 +; RV64ZVE32F-NEXT: .LBB40_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: and a1, a2, a1 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) @@ -879,15 +3788,137 @@ define <8 x i32> @mgather_baseidx_v8i32(i32* %base, <8 x i32> %idxs, <8 x i1> %m ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf2 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf2 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; 
RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: .LBB41_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB41_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB41_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB41_13 +; RV64ZVE32F-NEXT: .LBB41_6: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB41_14 +; RV64ZVE32F-NEXT: .LBB41_7: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_9 +; RV64ZVE32F-NEXT: .LBB41_8: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: 
vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB41_9: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB41_15 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB41_16 +; RV64ZVE32F-NEXT: .LBB41_11: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB41_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_6 +; RV64ZVE32F-NEXT: .LBB41_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_7 +; RV64ZVE32F-NEXT: .LBB41_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB41_8 +; RV64ZVE32F-NEXT: j .LBB41_9 +; RV64ZVE32F-NEXT: .LBB41_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: 
add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB41_11 +; RV64ZVE32F-NEXT: .LBB41_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %idxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) ret <8 x i32> %v @@ -896,19 +3927,49 @@ define <8 x i32> @mgather_baseidx_v8i32(i32* %base, <8 x i32> %idxs, <8 x i1> %m declare <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*>, i32, <1 x i1>, <1 x i64>) define <1 x i64> @mgather_v1i64(<1 x i64*> %ptrs, <1 x i1> %m, <1 x i64> %passthru) { -; RV32-LABEL: mgather_v1i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v1i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v1i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v1i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; 
RV32ZVE32F-LABEL: mgather_v1i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetvli a2, zero, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vmv.x.s a2, v9 +; RV32ZVE32F-NEXT: andi a2, a2, 1 +; RV32ZVE32F-NEXT: beqz a2, .LBB42_2 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: lw a1, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: .LBB42_2: # %else +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v1i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a2, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB42_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: ld a1, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_2: # %else +; RV64ZVE32F-NEXT: mv a0, a1 +; RV64ZVE32F-NEXT: ret %v = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> %ptrs, i32 8, <1 x i1> %m, <1 x i64> %passthru) ret <1 x i64> %v } @@ -916,19 +3977,98 @@ define <1 x i64> @mgather_v1i64(<1 x i64*> %ptrs, <1 x i1> %m, <1 x i64> %passth declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>) define <2 x i64> @mgather_v2i64(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64> %passthru) { -; RV32-LABEL: mgather_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v9 
-; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a2, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a2) +; RV32ZVE32F-NEXT: lbu a4, 15(sp) +; RV32ZVE32F-NEXT: andi a2, a4, 1 +; RV32ZVE32F-NEXT: beqz a2, .LBB43_3 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a2, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, a4, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB43_4 +; RV32ZVE32F-NEXT: .LBB43_2: +; RV32ZVE32F-NEXT: lw a4, 12(a1) +; RV32ZVE32F-NEXT: lw a1, 8(a1) +; RV32ZVE32F-NEXT: j .LBB43_5 +; RV32ZVE32F-NEXT: .LBB43_3: +; RV32ZVE32F-NEXT: lw a2, 4(a1) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: andi a4, a4, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB43_2 +; RV32ZVE32F-NEXT: .LBB43_4: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw a4, 4(a1) +; RV32ZVE32F-NEXT: lw a1, 0(a1) +; RV32ZVE32F-NEXT: .LBB43_5: # %else2 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a1, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; 
RV64ZVE32F-LABEL: mgather_v2i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a4, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a4) +; RV64ZVE32F-NEXT: lbu a4, 15(sp) +; RV64ZVE32F-NEXT: andi a5, a4, 1 +; RV64ZVE32F-NEXT: beqz a5, .LBB43_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: .LBB43_2: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: ld a3, 0(a1) +; RV64ZVE32F-NEXT: .LBB43_4: # %else2 +; RV64ZVE32F-NEXT: mv a0, a2 +; RV64ZVE32F-NEXT: mv a1, a3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %ptrs, i32 8, <2 x i1> %m, <2 x i64> %passthru) ret <2 x i64> %v } @@ -936,36 +4076,308 @@ define <2 x i64> @mgather_v2i64(<2 x i64*> %ptrs, <2 x i1> %m, <2 x i64> %passth declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>) define <4 x i64> @mgather_v4i64(<4 x i64*> %ptrs, <4 x i1> %m, <4 x i64> %passthru) { -; RV32-LABEL: mgather_v4i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vluxei32.v v10, (zero), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v10 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v4i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32V-NEXT: vluxei32.v v10, (zero), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v10 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v4i64: -; RV64: # %bb.0: 
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v4i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v4i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a2, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a2) +; RV32ZVE32F-NEXT: lbu a6, 15(sp) +; RV32ZVE32F-NEXT: andi a2, a6, 1 +; RV32ZVE32F-NEXT: beqz a2, .LBB44_5 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a2, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, a6, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB44_6 +; RV32ZVE32F-NEXT: .LBB44_2: +; RV32ZVE32F-NEXT: lw a4, 12(a1) +; RV32ZVE32F-NEXT: lw a5, 8(a1) +; RV32ZVE32F-NEXT: andi a7, a6, 4 +; RV32ZVE32F-NEXT: bnez a7, .LBB44_7 +; RV32ZVE32F-NEXT: .LBB44_3: +; RV32ZVE32F-NEXT: lw a7, 20(a1) +; RV32ZVE32F-NEXT: lw t0, 16(a1) +; RV32ZVE32F-NEXT: andi a6, a6, 8 +; RV32ZVE32F-NEXT: bnez a6, .LBB44_8 +; RV32ZVE32F-NEXT: .LBB44_4: +; RV32ZVE32F-NEXT: lw a6, 28(a1) +; RV32ZVE32F-NEXT: lw a1, 24(a1) +; RV32ZVE32F-NEXT: j .LBB44_9 +; RV32ZVE32F-NEXT: .LBB44_5: +; RV32ZVE32F-NEXT: lw a2, 4(a1) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: andi a4, a6, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB44_2 +; 
RV32ZVE32F-NEXT: .LBB44_6: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v9 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a7, a6, 4 +; RV32ZVE32F-NEXT: beqz a7, .LBB44_3 +; RV32ZVE32F-NEXT: .LBB44_7: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s t0, v9 +; RV32ZVE32F-NEXT: lw a7, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi a6, a6, 8 +; RV32ZVE32F-NEXT: beqz a6, .LBB44_4 +; RV32ZVE32F-NEXT: .LBB44_8: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw a6, 4(a1) +; RV32ZVE32F-NEXT: lw a1, 0(a1) +; RV32ZVE32F-NEXT: .LBB44_9: # %else8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw t0, 16(a0) +; RV32ZVE32F-NEXT: sw a7, 20(a0) +; RV32ZVE32F-NEXT: sw a1, 24(a0) +; RV32ZVE32F-NEXT: sw a6, 28(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v4i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a3, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a3) +; RV64ZVE32F-NEXT: lbu a5, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a5, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB44_5 +; RV64ZVE32F-NEXT: # 
%bb.1: # %cond.load +; RV64ZVE32F-NEXT: ld a3, 0(a1) +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB44_6 +; RV64ZVE32F-NEXT: .LBB44_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: bnez a6, .LBB44_7 +; RV64ZVE32F-NEXT: .LBB44_3: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: andi a5, a5, 8 +; RV64ZVE32F-NEXT: bnez a5, .LBB44_8 +; RV64ZVE32F-NEXT: .LBB44_4: +; RV64ZVE32F-NEXT: ld a1, 24(a2) +; RV64ZVE32F-NEXT: j .LBB44_9 +; RV64ZVE32F-NEXT: .LBB44_5: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB44_2 +; RV64ZVE32F-NEXT: .LBB44_6: # %cond.load1 +; RV64ZVE32F-NEXT: ld a4, 8(a1) +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: beqz a6, .LBB44_3 +; RV64ZVE32F-NEXT: .LBB44_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a6, 16(a1) +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi a5, a5, 8 +; RV64ZVE32F-NEXT: beqz a5, .LBB44_4 +; RV64ZVE32F-NEXT: .LBB44_8: # %cond.load7 +; RV64ZVE32F-NEXT: ld a1, 24(a1) +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB44_9: # %else8 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a1, 24(a0) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %ptrs, i32 8, <4 x i1> %m, <4 x i64> %passthru) ret <4 x i64> %v } define <4 x i64> @mgather_truemask_v4i64(<4 x i64*> %ptrs, <4 x i64> %passthru) { -; RV32-LABEL: mgather_truemask_v4i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vluxei32.v v10, (zero), v8 -; RV32-NEXT: vmv.v.v v8, v10 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_truemask_v4i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32V-NEXT: vluxei32.v v10, (zero), v8 +; RV32V-NEXT: vmv.v.v v8, v10 +; RV32V-NEXT: ret ; -; RV64-LABEL: 
mgather_truemask_v4i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-NEXT: vluxei64.v v8, (zero), v8 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_truemask_v4i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64V-NEXT: vluxei64.v v8, (zero), v8 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_truemask_v4i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: lw a2, 28(a1) +; RV32ZVE32F-NEXT: lw a3, 24(a1) +; RV32ZVE32F-NEXT: lw a4, 20(a1) +; RV32ZVE32F-NEXT: lw a5, 16(a1) +; RV32ZVE32F-NEXT: lw a6, 12(a1) +; RV32ZVE32F-NEXT: lw t0, 8(a1) +; RV32ZVE32F-NEXT: lw a7, 4(a1) +; RV32ZVE32F-NEXT: lw a1, 0(a1) +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmset.m v0 +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi t1, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (t1) +; RV32ZVE32F-NEXT: lb t1, 15(sp) +; RV32ZVE32F-NEXT: beqz zero, .LBB45_6 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi t2, t1, 2 +; RV32ZVE32F-NEXT: bnez t2, .LBB45_7 +; RV32ZVE32F-NEXT: .LBB45_2: # %else2 +; RV32ZVE32F-NEXT: andi t2, t1, 4 +; RV32ZVE32F-NEXT: bnez t2, .LBB45_8 +; RV32ZVE32F-NEXT: .LBB45_3: # %else5 +; RV32ZVE32F-NEXT: andi t1, t1, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB45_5 +; RV32ZVE32F-NEXT: .LBB45_4: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a2, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: .LBB45_5: # %else8 +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: sw 
a7, 4(a0) +; RV32ZVE32F-NEXT: sw t0, 8(a0) +; RV32ZVE32F-NEXT: sw a6, 12(a0) +; RV32ZVE32F-NEXT: sw a5, 16(a0) +; RV32ZVE32F-NEXT: sw a4, 20(a0) +; RV32ZVE32F-NEXT: sw a3, 24(a0) +; RV32ZVE32F-NEXT: sw a2, 28(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB45_6: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw a7, 4(a1) +; RV32ZVE32F-NEXT: lw a1, 0(a1) +; RV32ZVE32F-NEXT: andi t2, t1, 2 +; RV32ZVE32F-NEXT: beqz t2, .LBB45_2 +; RV32ZVE32F-NEXT: .LBB45_7: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s t0, v9 +; RV32ZVE32F-NEXT: lw a6, 4(t0) +; RV32ZVE32F-NEXT: lw t0, 0(t0) +; RV32ZVE32F-NEXT: andi t2, t1, 4 +; RV32ZVE32F-NEXT: beqz t2, .LBB45_3 +; RV32ZVE32F-NEXT: .LBB45_8: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a5, v9 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi t1, t1, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB45_4 +; RV32ZVE32F-NEXT: j .LBB45_5 +; +; RV64ZVE32F-LABEL: mgather_truemask_v4i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a3, 24(a2) +; RV64ZVE32F-NEXT: ld a4, 16(a2) +; RV64ZVE32F-NEXT: ld a5, 8(a2) +; RV64ZVE32F-NEXT: ld a2, 0(a2) +; RV64ZVE32F-NEXT: ld a6, 24(a1) +; RV64ZVE32F-NEXT: ld a7, 16(a1) +; RV64ZVE32F-NEXT: ld t0, 8(a1) +; RV64ZVE32F-NEXT: ld t1, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: 
vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a1) +; RV64ZVE32F-NEXT: lb a1, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB45_6 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi t1, a1, 2 +; RV64ZVE32F-NEXT: bnez t1, .LBB45_7 +; RV64ZVE32F-NEXT: .LBB45_2: # %else2 +; RV64ZVE32F-NEXT: andi t0, a1, 4 +; RV64ZVE32F-NEXT: bnez t0, .LBB45_8 +; RV64ZVE32F-NEXT: .LBB45_3: # %else5 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB45_5 +; RV64ZVE32F-NEXT: .LBB45_4: # %cond.load7 +; RV64ZVE32F-NEXT: ld a3, 0(a6) +; RV64ZVE32F-NEXT: .LBB45_5: # %else8 +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) +; RV64ZVE32F-NEXT: sd a4, 16(a0) +; RV64ZVE32F-NEXT: sd a3, 24(a0) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB45_6: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(t1) +; RV64ZVE32F-NEXT: andi t1, a1, 2 +; RV64ZVE32F-NEXT: beqz t1, .LBB45_2 +; RV64ZVE32F-NEXT: .LBB45_7: # %cond.load1 +; RV64ZVE32F-NEXT: ld a5, 0(t0) +; RV64ZVE32F-NEXT: andi t0, a1, 4 +; RV64ZVE32F-NEXT: beqz t0, .LBB45_3 +; RV64ZVE32F-NEXT: .LBB45_8: # %cond.load4 +; RV64ZVE32F-NEXT: ld a4, 0(a7) +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB45_4 +; RV64ZVE32F-NEXT: j .LBB45_5 %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %ptrs, i32 8, <4 x i1> %mtrue, <4 x i64> %passthru) @@ -973,10 +4385,47 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x i64*> %ptrs, <4 x i64> %passthru) } define <4 x i64> @mgather_falsemask_v4i64(<4 x i64*> %ptrs, <4 x i64> %passthru) { -; CHECK-LABEL: mgather_falsemask_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v8, v10 -; CHECK-NEXT: ret +; RV32V-LABEL: mgather_falsemask_v4i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vmv2r.v v8, v10 +; RV32V-NEXT: 
ret +; +; RV64V-LABEL: mgather_falsemask_v4i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vmv2r.v v8, v10 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_falsemask_v4i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: lw a2, 0(a1) +; RV32ZVE32F-NEXT: lw a3, 4(a1) +; RV32ZVE32F-NEXT: lw a4, 8(a1) +; RV32ZVE32F-NEXT: lw a5, 12(a1) +; RV32ZVE32F-NEXT: lw a6, 28(a1) +; RV32ZVE32F-NEXT: lw a7, 24(a1) +; RV32ZVE32F-NEXT: lw t0, 20(a1) +; RV32ZVE32F-NEXT: lw a1, 16(a1) +; RV32ZVE32F-NEXT: sw a6, 28(a0) +; RV32ZVE32F-NEXT: sw a7, 24(a0) +; RV32ZVE32F-NEXT: sw t0, 20(a0) +; RV32ZVE32F-NEXT: sw a1, 16(a0) +; RV32ZVE32F-NEXT: sw a5, 12(a0) +; RV32ZVE32F-NEXT: sw a4, 8(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_falsemask_v4i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a1, 24(a2) +; RV64ZVE32F-NEXT: ld a3, 16(a2) +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: ld a2, 0(a2) +; RV64ZVE32F-NEXT: sd a1, 24(a0) +; RV64ZVE32F-NEXT: sd a3, 16(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: ret %v = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %ptrs, i32 8, <4 x i1> zeroinitializer, <4 x i64> %passthru) ret <4 x i64> %v } @@ -984,68 +4433,798 @@ define <4 x i64> @mgather_falsemask_v4i64(<4 x i64*> %ptrs, <4 x i64> %passthru) declare <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>) define <8 x i64> @mgather_v8i64(<8 x i64*> %ptrs, <8 x i1> %m, <8 x i64> %passthru) { -; RV32-LABEL: mgather_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (zero), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (zero), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: 
vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vluxei64.v v12, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vluxei64.v v12, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi a2, t0, 1 +; RV32ZVE32F-NEXT: beqz a2, .LBB47_9 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a2, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB47_10 +; RV32ZVE32F-NEXT: .LBB47_2: +; RV32ZVE32F-NEXT: lw a4, 12(a1) +; RV32ZVE32F-NEXT: lw a5, 8(a1) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: bnez a6, .LBB47_11 +; RV32ZVE32F-NEXT: .LBB47_3: +; RV32ZVE32F-NEXT: lw a6, 20(a1) +; RV32ZVE32F-NEXT: lw a7, 16(a1) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB47_12 +; RV32ZVE32F-NEXT: .LBB47_4: +; RV32ZVE32F-NEXT: lw t1, 28(a1) +; RV32ZVE32F-NEXT: lw t2, 24(a1) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: bnez t3, .LBB47_13 +; RV32ZVE32F-NEXT: .LBB47_5: +; RV32ZVE32F-NEXT: lw t3, 36(a1) +; RV32ZVE32F-NEXT: lw t4, 32(a1) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: bnez t5, .LBB47_14 +; RV32ZVE32F-NEXT: .LBB47_6: +; RV32ZVE32F-NEXT: lw t5, 44(a1) +; RV32ZVE32F-NEXT: lw t6, 40(a1) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: bnez s0, .LBB47_15 +; RV32ZVE32F-NEXT: .LBB47_7: +; 
RV32ZVE32F-NEXT: lw s0, 52(a1) +; RV32ZVE32F-NEXT: lw s1, 48(a1) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: bnez t0, .LBB47_16 +; RV32ZVE32F-NEXT: .LBB47_8: +; RV32ZVE32F-NEXT: lw t0, 60(a1) +; RV32ZVE32F-NEXT: lw a1, 56(a1) +; RV32ZVE32F-NEXT: j .LBB47_17 +; RV32ZVE32F-NEXT: .LBB47_9: +; RV32ZVE32F-NEXT: lw a2, 4(a1) +; RV32ZVE32F-NEXT: lw a3, 0(a1) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB47_2 +; RV32ZVE32F-NEXT: .LBB47_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: beqz a6, .LBB47_3 +; RV32ZVE32F-NEXT: .LBB47_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 +; RV32ZVE32F-NEXT: lw a6, 4(a7) +; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB47_4 +; RV32ZVE32F-NEXT: .LBB47_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 +; RV32ZVE32F-NEXT: lw t1, 4(t2) +; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: beqz t3, .LBB47_5 +; RV32ZVE32F-NEXT: .LBB47_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 +; RV32ZVE32F-NEXT: lw t3, 4(t4) +; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: beqz t5, .LBB47_6 +; RV32ZVE32F-NEXT: .LBB47_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 +; RV32ZVE32F-NEXT: lw t5, 4(t6) +; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: andi s0, t0, 
64 +; RV32ZVE32F-NEXT: beqz s0, .LBB47_7 +; RV32ZVE32F-NEXT: .LBB47_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 +; RV32ZVE32F-NEXT: lw s0, 4(s1) +; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: beqz t0, .LBB47_8 +; RV32ZVE32F-NEXT: .LBB47_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw t0, 4(a1) +; RV32ZVE32F-NEXT: lw a1, 0(a1) +; RV32ZVE32F-NEXT: .LBB47_17: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw t2, 24(a0) +; RV32ZVE32F-NEXT: sw t1, 28(a0) +; RV32ZVE32F-NEXT: sw t4, 32(a0) +; RV32ZVE32F-NEXT: sw t3, 36(a0) +; RV32ZVE32F-NEXT: sw t6, 40(a0) +; RV32ZVE32F-NEXT: sw t5, 44(a0) +; RV32ZVE32F-NEXT: sw s1, 48(a0) +; RV32ZVE32F-NEXT: sw s0, 52(a0) +; RV32ZVE32F-NEXT: sw a1, 56(a0) +; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a6, v0 +; RV64ZVE32F-NEXT: andi a3, a6, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB47_9 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: ld a3, 0(a1) +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB47_10 +; RV64ZVE32F-NEXT: .LBB47_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: andi a5, a6, 4 +; RV64ZVE32F-NEXT: bnez a5, .LBB47_11 +; RV64ZVE32F-NEXT: .LBB47_3: +; RV64ZVE32F-NEXT: ld a5, 16(a2) +; RV64ZVE32F-NEXT: andi 
a7, a6, 8 +; RV64ZVE32F-NEXT: bnez a7, .LBB47_12 +; RV64ZVE32F-NEXT: .LBB47_4: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a6, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB47_13 +; RV64ZVE32F-NEXT: .LBB47_5: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a6, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB47_14 +; RV64ZVE32F-NEXT: .LBB47_6: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: andi t2, a6, 64 +; RV64ZVE32F-NEXT: bnez t2, .LBB47_15 +; RV64ZVE32F-NEXT: .LBB47_7: +; RV64ZVE32F-NEXT: ld t2, 48(a2) +; RV64ZVE32F-NEXT: andi a6, a6, -128 +; RV64ZVE32F-NEXT: bnez a6, .LBB47_16 +; RV64ZVE32F-NEXT: .LBB47_8: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB47_17 +; RV64ZVE32F-NEXT: .LBB47_9: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB47_2 +; RV64ZVE32F-NEXT: .LBB47_10: # %cond.load1 +; RV64ZVE32F-NEXT: ld a4, 8(a1) +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a6, 4 +; RV64ZVE32F-NEXT: beqz a5, .LBB47_3 +; RV64ZVE32F-NEXT: .LBB47_11: # %cond.load4 +; RV64ZVE32F-NEXT: ld a5, 16(a1) +; RV64ZVE32F-NEXT: ld a5, 0(a5) +; RV64ZVE32F-NEXT: andi a7, a6, 8 +; RV64ZVE32F-NEXT: beqz a7, .LBB47_4 +; RV64ZVE32F-NEXT: .LBB47_12: # %cond.load7 +; RV64ZVE32F-NEXT: ld a7, 24(a1) +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a6, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB47_5 +; RV64ZVE32F-NEXT: .LBB47_13: # %cond.load10 +; RV64ZVE32F-NEXT: ld t0, 32(a1) +; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: andi t1, a6, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB47_6 +; RV64ZVE32F-NEXT: .LBB47_14: # %cond.load13 +; RV64ZVE32F-NEXT: ld t1, 40(a1) +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: andi t2, a6, 64 +; RV64ZVE32F-NEXT: beqz t2, .LBB47_7 +; RV64ZVE32F-NEXT: .LBB47_15: # %cond.load16 +; RV64ZVE32F-NEXT: ld t2, 48(a1) +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi a6, a6, -128 +; RV64ZVE32F-NEXT: beqz a6, .LBB47_8 +; RV64ZVE32F-NEXT: .LBB47_16: # 
%cond.load19 +; RV64ZVE32F-NEXT: ld a1, 56(a1) +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB47_17: # %else20 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a5, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 24(a0) +; RV64ZVE32F-NEXT: sd t0, 32(a0) +; RV64ZVE32F-NEXT: sd t1, 40(a0) +; RV64ZVE32F-NEXT: sd t2, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) ret <8 x i64> %v } define <8 x i64> @mgather_baseidx_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i64> %passthru) { -; RV32-LABEL: mgather_baseidx_v8i8_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsext.vf4 v10, v8 -; RV32-NEXT: vsll.vi v8, v10, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_v8i8_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsext.vf4 v10, v8 +; RV32V-NEXT: vsll.vi v8, v10, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i8_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i8_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_v8i8_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 
+; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB48_9 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a1, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB48_10 +; RV32ZVE32F-NEXT: .LBB48_2: +; RV32ZVE32F-NEXT: lw a4, 12(a2) +; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: bnez a6, .LBB48_11 +; RV32ZVE32F-NEXT: .LBB48_3: +; RV32ZVE32F-NEXT: lw a6, 20(a2) +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB48_12 +; RV32ZVE32F-NEXT: .LBB48_4: +; RV32ZVE32F-NEXT: lw t1, 28(a2) +; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: bnez t3, .LBB48_13 +; RV32ZVE32F-NEXT: .LBB48_5: +; RV32ZVE32F-NEXT: lw t3, 36(a2) +; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: bnez t5, .LBB48_14 +; RV32ZVE32F-NEXT: .LBB48_6: +; RV32ZVE32F-NEXT: lw t5, 44(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: bnez s0, .LBB48_15 +; RV32ZVE32F-NEXT: .LBB48_7: +; RV32ZVE32F-NEXT: lw s0, 52(a2) +; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: bnez t0, .LBB48_16 +; RV32ZVE32F-NEXT: .LBB48_8: +; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: j .LBB48_17 +; RV32ZVE32F-NEXT: .LBB48_9: +; 
RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB48_2 +; RV32ZVE32F-NEXT: .LBB48_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: beqz a6, .LBB48_3 +; RV32ZVE32F-NEXT: .LBB48_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 +; RV32ZVE32F-NEXT: lw a6, 4(a7) +; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB48_4 +; RV32ZVE32F-NEXT: .LBB48_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 +; RV32ZVE32F-NEXT: lw t1, 4(t2) +; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: beqz t3, .LBB48_5 +; RV32ZVE32F-NEXT: .LBB48_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 +; RV32ZVE32F-NEXT: lw t3, 4(t4) +; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: beqz t5, .LBB48_6 +; RV32ZVE32F-NEXT: .LBB48_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 +; RV32ZVE32F-NEXT: lw t5, 4(t6) +; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: beqz s0, .LBB48_7 +; RV32ZVE32F-NEXT: .LBB48_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 +; RV32ZVE32F-NEXT: lw s0, 4(s1) +; RV32ZVE32F-NEXT: lw s1, 0(s1) +; 
RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: beqz t0, .LBB48_8 +; RV32ZVE32F-NEXT: .LBB48_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: .LBB48_17: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw t2, 24(a0) +; RV32ZVE32F-NEXT: sw t1, 28(a0) +; RV32ZVE32F-NEXT: sw t4, 32(a0) +; RV32ZVE32F-NEXT: sw t3, 36(a0) +; RV32ZVE32F-NEXT: sw t6, 40(a0) +; RV32ZVE32F-NEXT: sw t5, 44(a0) +; RV32ZVE32F-NEXT: sw s1, 48(a0) +; RV32ZVE32F-NEXT: sw s0, 52(a0) +; RV32ZVE32F-NEXT: sw a2, 56(a0) +; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i8_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB48_3 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB48_4 +; RV64ZVE32F-NEXT: .LBB48_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: j .LBB48_5 +; RV64ZVE32F-NEXT: .LBB48_3: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB48_2 +; RV64ZVE32F-NEXT: .LBB48_4: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: slli a4, 
a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: .LBB48_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB48_7 +; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: j .LBB48_8 +; RV64ZVE32F-NEXT: .LBB48_7: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: .LBB48_8: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB48_12 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB48_13 +; RV64ZVE32F-NEXT: .LBB48_10: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB48_14 +; RV64ZVE32F-NEXT: .LBB48_11: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB48_15 +; RV64ZVE32F-NEXT: .LBB48_12: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB48_10 +; RV64ZVE32F-NEXT: .LBB48_13: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: slli t0, t0, 3 +; RV64ZVE32F-NEXT: add t0, a1, t0 +; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB48_11 +; RV64ZVE32F-NEXT: .LBB48_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; 
RV64ZVE32F-NEXT: slli t1, t1, 3 +; RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: .LBB48_15: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB48_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB48_19 +; RV64ZVE32F-NEXT: .LBB48_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB48_20 +; RV64ZVE32F-NEXT: .LBB48_18: +; RV64ZVE32F-NEXT: ld t2, 48(a2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB48_17 +; RV64ZVE32F-NEXT: .LBB48_19: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB48_20: # %else20 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 24(a0) +; RV64ZVE32F-NEXT: sd t0, 32(a0) +; RV64ZVE32F-NEXT: sd t1, 40(a0) +; RV64ZVE32F-NEXT: sd t2, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i64, i64* %base, <8 x i8> %idxs %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) ret <8 x i64> %v } define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i64> %passthru) { -; RV32-LABEL: mgather_baseidx_sext_v8i8_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf8 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, 
v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_sext_v8i8_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf8 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i8_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i8_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB49_9 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a1, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) 
+; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB49_10 +; RV32ZVE32F-NEXT: .LBB49_2: +; RV32ZVE32F-NEXT: lw a4, 12(a2) +; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: bnez a6, .LBB49_11 +; RV32ZVE32F-NEXT: .LBB49_3: +; RV32ZVE32F-NEXT: lw a6, 20(a2) +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB49_12 +; RV32ZVE32F-NEXT: .LBB49_4: +; RV32ZVE32F-NEXT: lw t1, 28(a2) +; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: bnez t3, .LBB49_13 +; RV32ZVE32F-NEXT: .LBB49_5: +; RV32ZVE32F-NEXT: lw t3, 36(a2) +; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: bnez t5, .LBB49_14 +; RV32ZVE32F-NEXT: .LBB49_6: +; RV32ZVE32F-NEXT: lw t5, 44(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: bnez s0, .LBB49_15 +; RV32ZVE32F-NEXT: .LBB49_7: +; RV32ZVE32F-NEXT: lw s0, 52(a2) +; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: bnez t0, .LBB49_16 +; RV32ZVE32F-NEXT: .LBB49_8: +; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: j .LBB49_17 +; RV32ZVE32F-NEXT: .LBB49_9: +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB49_2 +; RV32ZVE32F-NEXT: .LBB49_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: beqz a6, .LBB49_3 +; RV32ZVE32F-NEXT: .LBB49_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 +; RV32ZVE32F-NEXT: lw a6, 4(a7) +; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: andi 
t1, t0, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB49_4 +; RV32ZVE32F-NEXT: .LBB49_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 +; RV32ZVE32F-NEXT: lw t1, 4(t2) +; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: beqz t3, .LBB49_5 +; RV32ZVE32F-NEXT: .LBB49_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 +; RV32ZVE32F-NEXT: lw t3, 4(t4) +; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: beqz t5, .LBB49_6 +; RV32ZVE32F-NEXT: .LBB49_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 +; RV32ZVE32F-NEXT: lw t5, 4(t6) +; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: beqz s0, .LBB49_7 +; RV32ZVE32F-NEXT: .LBB49_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 +; RV32ZVE32F-NEXT: lw s0, 4(s1) +; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: beqz t0, .LBB49_8 +; RV32ZVE32F-NEXT: .LBB49_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: .LBB49_17: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw t2, 24(a0) +; RV32ZVE32F-NEXT: sw t1, 28(a0) +; RV32ZVE32F-NEXT: sw t4, 32(a0) +; RV32ZVE32F-NEXT: sw t3, 36(a0) +; RV32ZVE32F-NEXT: sw t6, 40(a0) +; 
RV32ZVE32F-NEXT: sw t5, 44(a0) +; RV32ZVE32F-NEXT: sw s1, 48(a0) +; RV32ZVE32F-NEXT: sw s0, 52(a0) +; RV32ZVE32F-NEXT: sw a2, 56(a0) +; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB49_3 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB49_4 +; RV64ZVE32F-NEXT: .LBB49_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: j .LBB49_5 +; RV64ZVE32F-NEXT: .LBB49_3: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB49_2 +; RV64ZVE32F-NEXT: .LBB49_4: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: .LBB49_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB49_7 +; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: j .LBB49_8 +; RV64ZVE32F-NEXT: .LBB49_7: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: .LBB49_8: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, 
v8, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB49_12 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB49_13 +; RV64ZVE32F-NEXT: .LBB49_10: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB49_14 +; RV64ZVE32F-NEXT: .LBB49_11: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB49_15 +; RV64ZVE32F-NEXT: .LBB49_12: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB49_10 +; RV64ZVE32F-NEXT: .LBB49_13: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: slli t0, t0, 3 +; RV64ZVE32F-NEXT: add t0, a1, t0 +; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB49_11 +; RV64ZVE32F-NEXT: .LBB49_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: slli t1, t1, 3 +; RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: .LBB49_15: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB49_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB49_19 +; RV64ZVE32F-NEXT: .LBB49_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB49_20 +; RV64ZVE32F-NEXT: .LBB49_18: +; RV64ZVE32F-NEXT: ld t2, 
48(a2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB49_17 +; RV64ZVE32F-NEXT: .LBB49_19: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB49_20: # %else20 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 24(a0) +; RV64ZVE32F-NEXT: sd t0, 32(a0) +; RV64ZVE32F-NEXT: sd t1, 40(a0) +; RV64ZVE32F-NEXT: sd t2, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) @@ -1053,26 +5232,289 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 } define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i64> %passthru) { -; RV32-LABEL: mgather_baseidx_zext_v8i8_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf8 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_zext_v8i8_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vzext.vf8 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: 
mgather_baseidx_zext_v8i8_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf8 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf8 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_zext_v8i8_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vzext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB50_9 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a1, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB50_10 +; RV32ZVE32F-NEXT: .LBB50_2: +; RV32ZVE32F-NEXT: lw a4, 12(a2) +; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: bnez a6, .LBB50_11 +; RV32ZVE32F-NEXT: .LBB50_3: +; RV32ZVE32F-NEXT: lw a6, 20(a2) +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB50_12 +; RV32ZVE32F-NEXT: .LBB50_4: +; RV32ZVE32F-NEXT: lw t1, 28(a2) +; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: bnez t3, 
.LBB50_13 +; RV32ZVE32F-NEXT: .LBB50_5: +; RV32ZVE32F-NEXT: lw t3, 36(a2) +; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: bnez t5, .LBB50_14 +; RV32ZVE32F-NEXT: .LBB50_6: +; RV32ZVE32F-NEXT: lw t5, 44(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: bnez s0, .LBB50_15 +; RV32ZVE32F-NEXT: .LBB50_7: +; RV32ZVE32F-NEXT: lw s0, 52(a2) +; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: bnez t0, .LBB50_16 +; RV32ZVE32F-NEXT: .LBB50_8: +; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: j .LBB50_17 +; RV32ZVE32F-NEXT: .LBB50_9: +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB50_2 +; RV32ZVE32F-NEXT: .LBB50_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: beqz a6, .LBB50_3 +; RV32ZVE32F-NEXT: .LBB50_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 +; RV32ZVE32F-NEXT: lw a6, 4(a7) +; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB50_4 +; RV32ZVE32F-NEXT: .LBB50_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 +; RV32ZVE32F-NEXT: lw t1, 4(t2) +; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: beqz t3, .LBB50_5 +; RV32ZVE32F-NEXT: .LBB50_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 +; RV32ZVE32F-NEXT: lw t3, 
4(t4) +; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: beqz t5, .LBB50_6 +; RV32ZVE32F-NEXT: .LBB50_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 +; RV32ZVE32F-NEXT: lw t5, 4(t6) +; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: beqz s0, .LBB50_7 +; RV32ZVE32F-NEXT: .LBB50_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 +; RV32ZVE32F-NEXT: lw s0, 4(s1) +; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: beqz t0, .LBB50_8 +; RV32ZVE32F-NEXT: .LBB50_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: .LBB50_17: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw t2, 24(a0) +; RV32ZVE32F-NEXT: sw t1, 28(a0) +; RV32ZVE32F-NEXT: sw t4, 32(a0) +; RV32ZVE32F-NEXT: sw t3, 36(a0) +; RV32ZVE32F-NEXT: sw t6, 40(a0) +; RV32ZVE32F-NEXT: sw t5, 44(a0) +; RV32ZVE32F-NEXT: sw s1, 48(a0) +; RV32ZVE32F-NEXT: sw s0, 52(a0) +; RV32ZVE32F-NEXT: sw a2, 56(a0) +; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i8_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB50_3 
+; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB50_4 +; RV64ZVE32F-NEXT: .LBB50_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: j .LBB50_5 +; RV64ZVE32F-NEXT: .LBB50_3: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB50_2 +; RV64ZVE32F-NEXT: .LBB50_4: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: andi a4, a4, 255 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: .LBB50_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB50_7 +; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: andi a6, a6, 255 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: j .LBB50_8 +; RV64ZVE32F-NEXT: .LBB50_7: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: .LBB50_8: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB50_12 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: andi a7, a7, 255 +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB50_13 +; RV64ZVE32F-NEXT: .LBB50_10: +; 
RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB50_14 +; RV64ZVE32F-NEXT: .LBB50_11: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB50_15 +; RV64ZVE32F-NEXT: .LBB50_12: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB50_10 +; RV64ZVE32F-NEXT: .LBB50_13: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: andi t0, t0, 255 +; RV64ZVE32F-NEXT: slli t0, t0, 3 +; RV64ZVE32F-NEXT: add t0, a1, t0 +; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB50_11 +; RV64ZVE32F-NEXT: .LBB50_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: andi t1, t1, 255 +; RV64ZVE32F-NEXT: slli t1, t1, 3 +; RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: .LBB50_15: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB50_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: andi t2, t2, 255 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB50_19 +; RV64ZVE32F-NEXT: .LBB50_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB50_20 +; RV64ZVE32F-NEXT: .LBB50_18: +; RV64ZVE32F-NEXT: ld t2, 48(a2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB50_17 +; RV64ZVE32F-NEXT: .LBB50_19: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: 
slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB50_20: # %else20 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 24(a0) +; RV64ZVE32F-NEXT: sd t0, 32(a0) +; RV64ZVE32F-NEXT: sd t1, 40(a0) +; RV64ZVE32F-NEXT: sd t2, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) @@ -1080,50 +5522,562 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(i64* %base, <8 x i8> %idxs, <8 } define <8 x i64> @mgather_baseidx_v8i16_v8i64(i64* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) { -; RV32-LABEL: mgather_baseidx_v8i16_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsext.vf2 v10, v8 -; RV32-NEXT: vsll.vi v8, v10, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_v8i16_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsext.vf2 v10, v8 +; RV32V-NEXT: vsll.vi v8, v10, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i16_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf4 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i16_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf4 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t 
+; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_v8i16_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB51_9 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a1, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB51_10 +; RV32ZVE32F-NEXT: .LBB51_2: +; RV32ZVE32F-NEXT: lw a4, 12(a2) +; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: bnez a6, .LBB51_11 +; RV32ZVE32F-NEXT: .LBB51_3: +; RV32ZVE32F-NEXT: lw a6, 20(a2) +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB51_12 +; RV32ZVE32F-NEXT: .LBB51_4: +; RV32ZVE32F-NEXT: lw t1, 28(a2) +; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: bnez t3, .LBB51_13 +; RV32ZVE32F-NEXT: .LBB51_5: +; RV32ZVE32F-NEXT: lw t3, 36(a2) +; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: bnez t5, .LBB51_14 +; RV32ZVE32F-NEXT: .LBB51_6: +; RV32ZVE32F-NEXT: lw t5, 44(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: bnez s0, .LBB51_15 +; RV32ZVE32F-NEXT: .LBB51_7: +; RV32ZVE32F-NEXT: lw s0, 52(a2) +; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; 
RV32ZVE32F-NEXT: bnez t0, .LBB51_16 +; RV32ZVE32F-NEXT: .LBB51_8: +; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: j .LBB51_17 +; RV32ZVE32F-NEXT: .LBB51_9: +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB51_2 +; RV32ZVE32F-NEXT: .LBB51_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: beqz a6, .LBB51_3 +; RV32ZVE32F-NEXT: .LBB51_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 +; RV32ZVE32F-NEXT: lw a6, 4(a7) +; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB51_4 +; RV32ZVE32F-NEXT: .LBB51_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 +; RV32ZVE32F-NEXT: lw t1, 4(t2) +; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: beqz t3, .LBB51_5 +; RV32ZVE32F-NEXT: .LBB51_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 +; RV32ZVE32F-NEXT: lw t3, 4(t4) +; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: beqz t5, .LBB51_6 +; RV32ZVE32F-NEXT: .LBB51_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 +; RV32ZVE32F-NEXT: lw t5, 4(t6) +; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: beqz s0, .LBB51_7 +; RV32ZVE32F-NEXT: .LBB51_15: # %cond.load16 +; 
RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 +; RV32ZVE32F-NEXT: lw s0, 4(s1) +; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: beqz t0, .LBB51_8 +; RV32ZVE32F-NEXT: .LBB51_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: .LBB51_17: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw t2, 24(a0) +; RV32ZVE32F-NEXT: sw t1, 28(a0) +; RV32ZVE32F-NEXT: sw t4, 32(a0) +; RV32ZVE32F-NEXT: sw t3, 36(a0) +; RV32ZVE32F-NEXT: sw t6, 40(a0) +; RV32ZVE32F-NEXT: sw t5, 44(a0) +; RV32ZVE32F-NEXT: sw s1, 48(a0) +; RV32ZVE32F-NEXT: sw s0, 52(a0) +; RV32ZVE32F-NEXT: sw a2, 56(a0) +; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i16_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB51_3 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB51_4 +; RV64ZVE32F-NEXT: .LBB51_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: j .LBB51_5 +; RV64ZVE32F-NEXT: .LBB51_3: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: 
andi a4, a5, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB51_2 +; RV64ZVE32F-NEXT: .LBB51_4: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: .LBB51_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB51_7 +; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: j .LBB51_8 +; RV64ZVE32F-NEXT: .LBB51_7: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: .LBB51_8: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB51_12 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB51_13 +; RV64ZVE32F-NEXT: .LBB51_10: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB51_14 +; RV64ZVE32F-NEXT: .LBB51_11: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB51_15 +; RV64ZVE32F-NEXT: .LBB51_12: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB51_10 +; RV64ZVE32F-NEXT: .LBB51_13: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: slli t0, t0, 3 +; RV64ZVE32F-NEXT: add t0, a1, t0 +; RV64ZVE32F-NEXT: ld t0, 
0(t0) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB51_11 +; RV64ZVE32F-NEXT: .LBB51_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: slli t1, t1, 3 +; RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: .LBB51_15: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB51_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB51_19 +; RV64ZVE32F-NEXT: .LBB51_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB51_20 +; RV64ZVE32F-NEXT: .LBB51_18: +; RV64ZVE32F-NEXT: ld t2, 48(a2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB51_17 +; RV64ZVE32F-NEXT: .LBB51_19: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB51_20: # %else20 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 24(a0) +; RV64ZVE32F-NEXT: sd t0, 32(a0) +; RV64ZVE32F-NEXT: sd t1, 40(a0) +; RV64ZVE32F-NEXT: sd t2, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i64, i64* %base, <8 x i16> %idxs %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) ret <8 x i64> %v } define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(i64* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> 
%passthru) { -; RV32-LABEL: mgather_baseidx_sext_v8i16_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf4 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_sext_v8i16_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf4 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i16_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf4 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i16_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf4 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s 
t0, v0 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB52_9 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a1, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB52_10 +; RV32ZVE32F-NEXT: .LBB52_2: +; RV32ZVE32F-NEXT: lw a4, 12(a2) +; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: bnez a6, .LBB52_11 +; RV32ZVE32F-NEXT: .LBB52_3: +; RV32ZVE32F-NEXT: lw a6, 20(a2) +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB52_12 +; RV32ZVE32F-NEXT: .LBB52_4: +; RV32ZVE32F-NEXT: lw t1, 28(a2) +; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: bnez t3, .LBB52_13 +; RV32ZVE32F-NEXT: .LBB52_5: +; RV32ZVE32F-NEXT: lw t3, 36(a2) +; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: bnez t5, .LBB52_14 +; RV32ZVE32F-NEXT: .LBB52_6: +; RV32ZVE32F-NEXT: lw t5, 44(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: bnez s0, .LBB52_15 +; RV32ZVE32F-NEXT: .LBB52_7: +; RV32ZVE32F-NEXT: lw s0, 52(a2) +; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: bnez t0, .LBB52_16 +; RV32ZVE32F-NEXT: .LBB52_8: +; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: j .LBB52_17 +; RV32ZVE32F-NEXT: .LBB52_9: +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB52_2 +; RV32ZVE32F-NEXT: .LBB52_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: beqz a6, 
.LBB52_3 +; RV32ZVE32F-NEXT: .LBB52_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 +; RV32ZVE32F-NEXT: lw a6, 4(a7) +; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB52_4 +; RV32ZVE32F-NEXT: .LBB52_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 +; RV32ZVE32F-NEXT: lw t1, 4(t2) +; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: beqz t3, .LBB52_5 +; RV32ZVE32F-NEXT: .LBB52_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 +; RV32ZVE32F-NEXT: lw t3, 4(t4) +; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: beqz t5, .LBB52_6 +; RV32ZVE32F-NEXT: .LBB52_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 +; RV32ZVE32F-NEXT: lw t5, 4(t6) +; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: beqz s0, .LBB52_7 +; RV32ZVE32F-NEXT: .LBB52_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 +; RV32ZVE32F-NEXT: lw s0, 4(s1) +; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: beqz t0, .LBB52_8 +; RV32ZVE32F-NEXT: .LBB52_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: .LBB52_17: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) 
+; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw t2, 24(a0) +; RV32ZVE32F-NEXT: sw t1, 28(a0) +; RV32ZVE32F-NEXT: sw t4, 32(a0) +; RV32ZVE32F-NEXT: sw t3, 36(a0) +; RV32ZVE32F-NEXT: sw t6, 40(a0) +; RV32ZVE32F-NEXT: sw t5, 44(a0) +; RV32ZVE32F-NEXT: sw s1, 48(a0) +; RV32ZVE32F-NEXT: sw s0, 52(a0) +; RV32ZVE32F-NEXT: sw a2, 56(a0) +; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB52_3 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB52_4 +; RV64ZVE32F-NEXT: .LBB52_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: j .LBB52_5 +; RV64ZVE32F-NEXT: .LBB52_3: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB52_2 +; RV64ZVE32F-NEXT: .LBB52_4: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: .LBB52_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB52_7 +; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: slli a6, 
a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: j .LBB52_8 +; RV64ZVE32F-NEXT: .LBB52_7: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: .LBB52_8: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB52_12 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB52_13 +; RV64ZVE32F-NEXT: .LBB52_10: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB52_14 +; RV64ZVE32F-NEXT: .LBB52_11: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB52_15 +; RV64ZVE32F-NEXT: .LBB52_12: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB52_10 +; RV64ZVE32F-NEXT: .LBB52_13: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: slli t0, t0, 3 +; RV64ZVE32F-NEXT: add t0, a1, t0 +; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB52_11 +; RV64ZVE32F-NEXT: .LBB52_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: slli t1, t1, 3 +; RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: .LBB52_15: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB52_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s 
t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB52_19 +; RV64ZVE32F-NEXT: .LBB52_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB52_20 +; RV64ZVE32F-NEXT: .LBB52_18: +; RV64ZVE32F-NEXT: ld t2, 48(a2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB52_17 +; RV64ZVE32F-NEXT: .LBB52_19: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB52_20: # %else20 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 24(a0) +; RV64ZVE32F-NEXT: sd t0, 32(a0) +; RV64ZVE32F-NEXT: sd t1, 40(a0) +; RV64ZVE32F-NEXT: sd t2, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) @@ -1131,26 +6085,292 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(i64* %base, <8 x i16> %idxs, } define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(i64* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) { -; RV32-LABEL: mgather_baseidx_zext_v8i16_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf4 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_zext_v8i16_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, 
e64, m4, ta, mu +; RV32V-NEXT: vzext.vf4 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_zext_v8i16_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf4 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf4 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB53_9 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a1, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB53_10 +; RV32ZVE32F-NEXT: .LBB53_2: +; RV32ZVE32F-NEXT: lw a4, 12(a2) +; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: bnez a6, .LBB53_11 
+; RV32ZVE32F-NEXT: .LBB53_3: +; RV32ZVE32F-NEXT: lw a6, 20(a2) +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB53_12 +; RV32ZVE32F-NEXT: .LBB53_4: +; RV32ZVE32F-NEXT: lw t1, 28(a2) +; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: bnez t3, .LBB53_13 +; RV32ZVE32F-NEXT: .LBB53_5: +; RV32ZVE32F-NEXT: lw t3, 36(a2) +; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: bnez t5, .LBB53_14 +; RV32ZVE32F-NEXT: .LBB53_6: +; RV32ZVE32F-NEXT: lw t5, 44(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: bnez s0, .LBB53_15 +; RV32ZVE32F-NEXT: .LBB53_7: +; RV32ZVE32F-NEXT: lw s0, 52(a2) +; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: bnez t0, .LBB53_16 +; RV32ZVE32F-NEXT: .LBB53_8: +; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: j .LBB53_17 +; RV32ZVE32F-NEXT: .LBB53_9: +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB53_2 +; RV32ZVE32F-NEXT: .LBB53_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: beqz a6, .LBB53_3 +; RV32ZVE32F-NEXT: .LBB53_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 +; RV32ZVE32F-NEXT: lw a6, 4(a7) +; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB53_4 +; RV32ZVE32F-NEXT: .LBB53_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 +; 
RV32ZVE32F-NEXT: lw t1, 4(t2) +; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: beqz t3, .LBB53_5 +; RV32ZVE32F-NEXT: .LBB53_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 +; RV32ZVE32F-NEXT: lw t3, 4(t4) +; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: beqz t5, .LBB53_6 +; RV32ZVE32F-NEXT: .LBB53_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 +; RV32ZVE32F-NEXT: lw t5, 4(t6) +; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: beqz s0, .LBB53_7 +; RV32ZVE32F-NEXT: .LBB53_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 +; RV32ZVE32F-NEXT: lw s0, 4(s1) +; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: beqz t0, .LBB53_8 +; RV32ZVE32F-NEXT: .LBB53_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: .LBB53_17: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw t2, 24(a0) +; RV32ZVE32F-NEXT: sw t1, 28(a0) +; RV32ZVE32F-NEXT: sw t4, 32(a0) +; RV32ZVE32F-NEXT: sw t3, 36(a0) +; RV32ZVE32F-NEXT: sw t6, 40(a0) +; RV32ZVE32F-NEXT: sw t5, 44(a0) +; RV32ZVE32F-NEXT: sw s1, 48(a0) +; RV32ZVE32F-NEXT: sw s0, 52(a0) +; RV32ZVE32F-NEXT: sw a2, 56(a0) +; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; 
RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: lui a3, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a6, v0 +; RV64ZVE32F-NEXT: andi a4, a6, 1 +; RV64ZVE32F-NEXT: addiw a5, a3, -1 +; RV64ZVE32F-NEXT: beqz a4, .LBB53_3 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a5 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB53_4 +; RV64ZVE32F-NEXT: .LBB53_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: j .LBB53_5 +; RV64ZVE32F-NEXT: .LBB53_3: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB53_2 +; RV64ZVE32F-NEXT: .LBB53_4: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: and a4, a4, a5 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: .LBB53_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a7, .LBB53_7 +; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: and a7, a7, a5 +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: j .LBB53_8 +; RV64ZVE32F-NEXT: .LBB53_7: +; RV64ZVE32F-NEXT: ld a7, 16(a2) +; RV64ZVE32F-NEXT: .LBB53_8: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi t0, a6, 8 +; RV64ZVE32F-NEXT: vslidedown.vi 
v8, v8, 4 +; RV64ZVE32F-NEXT: beqz t0, .LBB53_12 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 +; RV64ZVE32F-NEXT: and t0, t0, a5 +; RV64ZVE32F-NEXT: slli t0, t0, 3 +; RV64ZVE32F-NEXT: add t0, a1, t0 +; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: andi t1, a6, 16 +; RV64ZVE32F-NEXT: bnez t1, .LBB53_13 +; RV64ZVE32F-NEXT: .LBB53_10: +; RV64ZVE32F-NEXT: ld t1, 32(a2) +; RV64ZVE32F-NEXT: andi t2, a6, 32 +; RV64ZVE32F-NEXT: bnez t2, .LBB53_14 +; RV64ZVE32F-NEXT: .LBB53_11: +; RV64ZVE32F-NEXT: ld t2, 40(a2) +; RV64ZVE32F-NEXT: j .LBB53_15 +; RV64ZVE32F-NEXT: .LBB53_12: +; RV64ZVE32F-NEXT: ld t0, 24(a2) +; RV64ZVE32F-NEXT: andi t1, a6, 16 +; RV64ZVE32F-NEXT: beqz t1, .LBB53_10 +; RV64ZVE32F-NEXT: .LBB53_13: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: and t1, t1, a5 +; RV64ZVE32F-NEXT: slli t1, t1, 3 +; RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: andi t2, a6, 32 +; RV64ZVE32F-NEXT: beqz t2, .LBB53_11 +; RV64ZVE32F-NEXT: .LBB53_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t2, v9 +; RV64ZVE32F-NEXT: and t2, t2, a5 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: .LBB53_15: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi t3, a6, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t3, .LBB53_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s t3, v8 +; RV64ZVE32F-NEXT: and t3, t3, a5 +; RV64ZVE32F-NEXT: slli t3, t3, 3 +; RV64ZVE32F-NEXT: add t3, a1, t3 +; RV64ZVE32F-NEXT: ld t3, 0(t3) +; RV64ZVE32F-NEXT: andi a6, a6, -128 +; RV64ZVE32F-NEXT: bnez a6, .LBB53_19 +; 
RV64ZVE32F-NEXT: .LBB53_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB53_20 +; RV64ZVE32F-NEXT: .LBB53_18: +; RV64ZVE32F-NEXT: ld t3, 48(a2) +; RV64ZVE32F-NEXT: andi a6, a6, -128 +; RV64ZVE32F-NEXT: beqz a6, .LBB53_17 +; RV64ZVE32F-NEXT: .LBB53_19: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: and a2, a2, a5 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB53_20: # %else20 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a7, 16(a0) +; RV64ZVE32F-NEXT: sd t0, 24(a0) +; RV64ZVE32F-NEXT: sd t1, 32(a0) +; RV64ZVE32F-NEXT: sd t2, 40(a0) +; RV64ZVE32F-NEXT: sd t3, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) @@ -1158,49 +6378,559 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(i64* %base, <8 x i16> %idxs, } define <8 x i64> @mgather_baseidx_v8i32_v8i64(i64* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x i64> %passthru) { -; RV32-LABEL: mgather_baseidx_v8i32_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsll.vi v8, v8, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_v8i32_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsll.vi v8, v8, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i32_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 
8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf2 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i32_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_v8i32_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB54_9 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a1, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB54_10 +; RV32ZVE32F-NEXT: .LBB54_2: +; RV32ZVE32F-NEXT: lw a4, 12(a2) +; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: bnez a6, .LBB54_11 +; RV32ZVE32F-NEXT: .LBB54_3: +; RV32ZVE32F-NEXT: lw a6, 20(a2) +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB54_12 +; RV32ZVE32F-NEXT: .LBB54_4: +; RV32ZVE32F-NEXT: lw t1, 28(a2) +; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: bnez t3, .LBB54_13 +; RV32ZVE32F-NEXT: .LBB54_5: +; RV32ZVE32F-NEXT: lw t3, 36(a2) +; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: 
andi t5, t0, 32 +; RV32ZVE32F-NEXT: bnez t5, .LBB54_14 +; RV32ZVE32F-NEXT: .LBB54_6: +; RV32ZVE32F-NEXT: lw t5, 44(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: bnez s0, .LBB54_15 +; RV32ZVE32F-NEXT: .LBB54_7: +; RV32ZVE32F-NEXT: lw s0, 52(a2) +; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: bnez t0, .LBB54_16 +; RV32ZVE32F-NEXT: .LBB54_8: +; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: j .LBB54_17 +; RV32ZVE32F-NEXT: .LBB54_9: +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB54_2 +; RV32ZVE32F-NEXT: .LBB54_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: beqz a6, .LBB54_3 +; RV32ZVE32F-NEXT: .LBB54_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 +; RV32ZVE32F-NEXT: lw a6, 4(a7) +; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB54_4 +; RV32ZVE32F-NEXT: .LBB54_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 +; RV32ZVE32F-NEXT: lw t1, 4(t2) +; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: beqz t3, .LBB54_5 +; RV32ZVE32F-NEXT: .LBB54_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 +; RV32ZVE32F-NEXT: lw t3, 4(t4) +; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: beqz t5, .LBB54_6 +; 
RV32ZVE32F-NEXT: .LBB54_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 +; RV32ZVE32F-NEXT: lw t5, 4(t6) +; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: beqz s0, .LBB54_7 +; RV32ZVE32F-NEXT: .LBB54_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 +; RV32ZVE32F-NEXT: lw s0, 4(s1) +; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: beqz t0, .LBB54_8 +; RV32ZVE32F-NEXT: .LBB54_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: .LBB54_17: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw t2, 24(a0) +; RV32ZVE32F-NEXT: sw t1, 28(a0) +; RV32ZVE32F-NEXT: sw t4, 32(a0) +; RV32ZVE32F-NEXT: sw t3, 36(a0) +; RV32ZVE32F-NEXT: sw t6, 40(a0) +; RV32ZVE32F-NEXT: sw t5, 44(a0) +; RV32ZVE32F-NEXT: sw s1, 48(a0) +; RV32ZVE32F-NEXT: sw s0, 52(a0) +; RV32ZVE32F-NEXT: sw a2, 56(a0) +; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i32_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB54_3 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: 
vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB54_4 +; RV64ZVE32F-NEXT: .LBB54_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: j .LBB54_5 +; RV64ZVE32F-NEXT: .LBB54_3: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB54_2 +; RV64ZVE32F-NEXT: .LBB54_4: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: .LBB54_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB54_10 +; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: bnez a7, .LBB54_11 +; RV64ZVE32F-NEXT: .LBB54_7: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB54_12 +; RV64ZVE32F-NEXT: .LBB54_8: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB54_13 +; RV64ZVE32F-NEXT: .LBB54_9: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB54_14 +; RV64ZVE32F-NEXT: .LBB54_10: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: beqz a7, .LBB54_7 +; RV64ZVE32F-NEXT: .LBB54_11: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; 
RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB54_8 +; RV64ZVE32F-NEXT: .LBB54_12: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t0, v10 +; RV64ZVE32F-NEXT: slli t0, t0, 3 +; RV64ZVE32F-NEXT: add t0, a1, t0 +; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB54_9 +; RV64ZVE32F-NEXT: .LBB54_13: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: slli t1, t1, 3 +; RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: .LBB54_14: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB54_17 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB54_18 +; RV64ZVE32F-NEXT: .LBB54_16: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB54_19 +; RV64ZVE32F-NEXT: .LBB54_17: +; RV64ZVE32F-NEXT: ld t2, 48(a2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB54_16 +; RV64ZVE32F-NEXT: .LBB54_18: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB54_19: # %else20 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 24(a0) +; RV64ZVE32F-NEXT: sd t0, 32(a0) +; RV64ZVE32F-NEXT: sd t1, 40(a0) +; RV64ZVE32F-NEXT: sd 
t2, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i64, i64* %base, <8 x i32> %idxs %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) ret <8 x i64> %v } define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(i64* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x i64> %passthru) { -; RV32-LABEL: mgather_baseidx_sext_v8i32_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_sext_v8i32_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf2 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i32_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf2 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i32_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_sext_v8i32_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; 
RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB55_9 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a1, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB55_10 +; RV32ZVE32F-NEXT: .LBB55_2: +; RV32ZVE32F-NEXT: lw a4, 12(a2) +; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: bnez a6, .LBB55_11 +; RV32ZVE32F-NEXT: .LBB55_3: +; RV32ZVE32F-NEXT: lw a6, 20(a2) +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB55_12 +; RV32ZVE32F-NEXT: .LBB55_4: +; RV32ZVE32F-NEXT: lw t1, 28(a2) +; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: bnez t3, .LBB55_13 +; RV32ZVE32F-NEXT: .LBB55_5: +; RV32ZVE32F-NEXT: lw t3, 36(a2) +; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: bnez t5, .LBB55_14 +; RV32ZVE32F-NEXT: .LBB55_6: +; RV32ZVE32F-NEXT: lw t5, 44(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: bnez s0, .LBB55_15 +; RV32ZVE32F-NEXT: .LBB55_7: +; RV32ZVE32F-NEXT: lw s0, 52(a2) +; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: bnez t0, .LBB55_16 +; RV32ZVE32F-NEXT: .LBB55_8: +; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: j .LBB55_17 +; RV32ZVE32F-NEXT: .LBB55_9: +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; 
RV32ZVE32F-NEXT: beqz a4, .LBB55_2 +; RV32ZVE32F-NEXT: .LBB55_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: beqz a6, .LBB55_3 +; RV32ZVE32F-NEXT: .LBB55_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 +; RV32ZVE32F-NEXT: lw a6, 4(a7) +; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB55_4 +; RV32ZVE32F-NEXT: .LBB55_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 +; RV32ZVE32F-NEXT: lw t1, 4(t2) +; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: beqz t3, .LBB55_5 +; RV32ZVE32F-NEXT: .LBB55_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 +; RV32ZVE32F-NEXT: lw t3, 4(t4) +; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: beqz t5, .LBB55_6 +; RV32ZVE32F-NEXT: .LBB55_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 +; RV32ZVE32F-NEXT: lw t5, 4(t6) +; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: beqz s0, .LBB55_7 +; RV32ZVE32F-NEXT: .LBB55_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 +; RV32ZVE32F-NEXT: lw s0, 4(s1) +; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: beqz t0, .LBB55_8 +; RV32ZVE32F-NEXT: .LBB55_16: # 
%cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: .LBB55_17: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw t2, 24(a0) +; RV32ZVE32F-NEXT: sw t1, 28(a0) +; RV32ZVE32F-NEXT: sw t4, 32(a0) +; RV32ZVE32F-NEXT: sw t3, 36(a0) +; RV32ZVE32F-NEXT: sw t6, 40(a0) +; RV32ZVE32F-NEXT: sw t5, 44(a0) +; RV32ZVE32F-NEXT: sw s1, 48(a0) +; RV32ZVE32F-NEXT: sw s0, 52(a0) +; RV32ZVE32F-NEXT: sw a2, 56(a0) +; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i32_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB55_3 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB55_4 +; RV64ZVE32F-NEXT: .LBB55_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: j .LBB55_5 +; RV64ZVE32F-NEXT: .LBB55_3: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB55_2 +; RV64ZVE32F-NEXT: .LBB55_4: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 
+; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: .LBB55_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB55_10 +; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: bnez a7, .LBB55_11 +; RV64ZVE32F-NEXT: .LBB55_7: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB55_12 +; RV64ZVE32F-NEXT: .LBB55_8: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB55_13 +; RV64ZVE32F-NEXT: .LBB55_9: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB55_14 +; RV64ZVE32F-NEXT: .LBB55_10: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: beqz a7, .LBB55_7 +; RV64ZVE32F-NEXT: .LBB55_11: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB55_8 +; RV64ZVE32F-NEXT: .LBB55_12: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t0, v10 +; RV64ZVE32F-NEXT: slli t0, t0, 3 +; RV64ZVE32F-NEXT: add t0, a1, t0 +; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB55_9 +; RV64ZVE32F-NEXT: .LBB55_13: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: slli t1, t1, 3 +; 
RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: .LBB55_14: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB55_17 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB55_18 +; RV64ZVE32F-NEXT: .LBB55_16: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB55_19 +; RV64ZVE32F-NEXT: .LBB55_17: +; RV64ZVE32F-NEXT: ld t2, 48(a2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB55_16 +; RV64ZVE32F-NEXT: .LBB55_18: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB55_19: # %else20 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 24(a0) +; RV64ZVE32F-NEXT: sd t0, 32(a0) +; RV64ZVE32F-NEXT: sd t1, 40(a0) +; RV64ZVE32F-NEXT: sd t2, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) @@ -1208,26 +6938,289 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(i64* %base, <8 x i32> %idxs, } define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(i64* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x i64> %passthru) { -; RV32-LABEL: mgather_baseidx_zext_v8i32_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf2 v16, v8 -; 
RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_zext_v8i32_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vzext.vf2 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_zext_v8i32_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf2 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i32_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_zext_v8i32_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB56_9 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: 
vmv.x.s a3, v8 +; RV32ZVE32F-NEXT: lw a1, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: bnez a4, .LBB56_10 +; RV32ZVE32F-NEXT: .LBB56_2: +; RV32ZVE32F-NEXT: lw a4, 12(a2) +; RV32ZVE32F-NEXT: lw a5, 8(a2) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: bnez a6, .LBB56_11 +; RV32ZVE32F-NEXT: .LBB56_3: +; RV32ZVE32F-NEXT: lw a6, 20(a2) +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: bnez t1, .LBB56_12 +; RV32ZVE32F-NEXT: .LBB56_4: +; RV32ZVE32F-NEXT: lw t1, 28(a2) +; RV32ZVE32F-NEXT: lw t2, 24(a2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: bnez t3, .LBB56_13 +; RV32ZVE32F-NEXT: .LBB56_5: +; RV32ZVE32F-NEXT: lw t3, 36(a2) +; RV32ZVE32F-NEXT: lw t4, 32(a2) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: bnez t5, .LBB56_14 +; RV32ZVE32F-NEXT: .LBB56_6: +; RV32ZVE32F-NEXT: lw t5, 44(a2) +; RV32ZVE32F-NEXT: lw t6, 40(a2) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: bnez s0, .LBB56_15 +; RV32ZVE32F-NEXT: .LBB56_7: +; RV32ZVE32F-NEXT: lw s0, 52(a2) +; RV32ZVE32F-NEXT: lw s1, 48(a2) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: bnez t0, .LBB56_16 +; RV32ZVE32F-NEXT: .LBB56_8: +; RV32ZVE32F-NEXT: lw t0, 60(a2) +; RV32ZVE32F-NEXT: lw a2, 56(a2) +; RV32ZVE32F-NEXT: j .LBB56_17 +; RV32ZVE32F-NEXT: .LBB56_9: +; RV32ZVE32F-NEXT: lw a1, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: andi a4, t0, 2 +; RV32ZVE32F-NEXT: beqz a4, .LBB56_2 +; RV32ZVE32F-NEXT: .LBB56_10: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 +; RV32ZVE32F-NEXT: lw a4, 4(a5) +; RV32ZVE32F-NEXT: lw a5, 0(a5) +; RV32ZVE32F-NEXT: andi a6, t0, 4 +; RV32ZVE32F-NEXT: beqz a6, .LBB56_3 +; RV32ZVE32F-NEXT: .LBB56_11: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 +; 
RV32ZVE32F-NEXT: lw a6, 4(a7) +; RV32ZVE32F-NEXT: lw a7, 0(a7) +; RV32ZVE32F-NEXT: andi t1, t0, 8 +; RV32ZVE32F-NEXT: beqz t1, .LBB56_4 +; RV32ZVE32F-NEXT: .LBB56_12: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 +; RV32ZVE32F-NEXT: lw t1, 4(t2) +; RV32ZVE32F-NEXT: lw t2, 0(t2) +; RV32ZVE32F-NEXT: andi t3, t0, 16 +; RV32ZVE32F-NEXT: beqz t3, .LBB56_5 +; RV32ZVE32F-NEXT: .LBB56_13: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 +; RV32ZVE32F-NEXT: lw t3, 4(t4) +; RV32ZVE32F-NEXT: lw t4, 0(t4) +; RV32ZVE32F-NEXT: andi t5, t0, 32 +; RV32ZVE32F-NEXT: beqz t5, .LBB56_6 +; RV32ZVE32F-NEXT: .LBB56_14: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 +; RV32ZVE32F-NEXT: lw t5, 4(t6) +; RV32ZVE32F-NEXT: lw t6, 0(t6) +; RV32ZVE32F-NEXT: andi s0, t0, 64 +; RV32ZVE32F-NEXT: beqz s0, .LBB56_7 +; RV32ZVE32F-NEXT: .LBB56_15: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 +; RV32ZVE32F-NEXT: lw s0, 4(s1) +; RV32ZVE32F-NEXT: lw s1, 0(s1) +; RV32ZVE32F-NEXT: andi t0, t0, -128 +; RV32ZVE32F-NEXT: beqz t0, .LBB56_8 +; RV32ZVE32F-NEXT: .LBB56_16: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw a2, 0(a2) +; RV32ZVE32F-NEXT: .LBB56_17: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: sw a5, 8(a0) +; RV32ZVE32F-NEXT: sw a4, 12(a0) +; RV32ZVE32F-NEXT: sw a7, 16(a0) +; RV32ZVE32F-NEXT: sw a6, 20(a0) +; RV32ZVE32F-NEXT: sw t2, 24(a0) +; RV32ZVE32F-NEXT: sw t1, 28(a0) +; RV32ZVE32F-NEXT: sw 
t4, 32(a0) +; RV32ZVE32F-NEXT: sw t3, 36(a0) +; RV32ZVE32F-NEXT: sw t6, 40(a0) +; RV32ZVE32F-NEXT: sw t5, 44(a0) +; RV32ZVE32F-NEXT: sw s1, 48(a0) +; RV32ZVE32F-NEXT: sw s0, 52(a0) +; RV32ZVE32F-NEXT: sw a2, 56(a0) +; RV32ZVE32F-NEXT: sw t0, 60(a0) +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i32_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB56_3 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 32 +; RV64ZVE32F-NEXT: srli a3, a3, 29 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: ld a3, 0(a3) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB56_4 +; RV64ZVE32F-NEXT: .LBB56_2: +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: j .LBB56_5 +; RV64ZVE32F-NEXT: .LBB56_3: +; RV64ZVE32F-NEXT: ld a3, 0(a2) +; RV64ZVE32F-NEXT: andi a4, a5, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB56_2 +; RV64ZVE32F-NEXT: .LBB56_4: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: slli a4, a4, 32 +; RV64ZVE32F-NEXT: srli a4, a4, 29 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: .LBB56_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB56_10 +; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: slli a6, a6, 32 +; 
RV64ZVE32F-NEXT: srli a6, a6, 29 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: bnez a7, .LBB56_11 +; RV64ZVE32F-NEXT: .LBB56_7: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB56_12 +; RV64ZVE32F-NEXT: .LBB56_8: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB56_13 +; RV64ZVE32F-NEXT: .LBB56_9: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB56_14 +; RV64ZVE32F-NEXT: .LBB56_10: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: beqz a7, .LBB56_7 +; RV64ZVE32F-NEXT: .LBB56_11: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: slli a7, a7, 32 +; RV64ZVE32F-NEXT: srli a7, a7, 29 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB56_8 +; RV64ZVE32F-NEXT: .LBB56_12: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t0, v10 +; RV64ZVE32F-NEXT: slli t0, t0, 32 +; RV64ZVE32F-NEXT: srli t0, t0, 29 +; RV64ZVE32F-NEXT: add t0, a1, t0 +; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB56_9 +; RV64ZVE32F-NEXT: .LBB56_13: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: slli t1, t1, 32 +; RV64ZVE32F-NEXT: srli t1, t1, 29 +; RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: .LBB56_14: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB56_17 +; RV64ZVE32F-NEXT: # %bb.15: # 
%cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 32 +; RV64ZVE32F-NEXT: srli t2, t2, 29 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB56_18 +; RV64ZVE32F-NEXT: .LBB56_16: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB56_19 +; RV64ZVE32F-NEXT: .LBB56_17: +; RV64ZVE32F-NEXT: ld t2, 48(a2) +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB56_16 +; RV64ZVE32F-NEXT: .LBB56_18: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB56_19: # %else20 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 24(a0) +; RV64ZVE32F-NEXT: sd t0, 32(a0) +; RV64ZVE32F-NEXT: sd t1, 40(a0) +; RV64ZVE32F-NEXT: sd t2, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) @@ -1235,24 +7228,314 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(i64* %base, <8 x i32> %idxs, } define <8 x i64> @mgather_baseidx_v8i64(i64* %base, <8 x i64> %idxs, <8 x i1> %m, <8 x i64> %passthru) { -; RV32-LABEL: mgather_baseidx_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsll.vi v8, v8, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_v8i64: +; 
RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsll.vi v8, v8, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsll.vi v8, v8, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsll.vi v8, v8, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -128 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 128 +; RV32ZVE32F-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 116(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 112(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s4, 108(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s5, 104(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s6, 100(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s7, 96(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s8, 92(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s9, 88(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s10, 84(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s11, 80(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset ra, -4 +; RV32ZVE32F-NEXT: .cfi_offset s0, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: .cfi_offset s3, -16 +; RV32ZVE32F-NEXT: .cfi_offset s4, -20 +; RV32ZVE32F-NEXT: .cfi_offset s5, -24 +; RV32ZVE32F-NEXT: .cfi_offset s6, -28 +; RV32ZVE32F-NEXT: .cfi_offset s7, -32 +; RV32ZVE32F-NEXT: .cfi_offset s8, -36 +; 
RV32ZVE32F-NEXT: .cfi_offset s9, -40 +; RV32ZVE32F-NEXT: .cfi_offset s10, -44 +; RV32ZVE32F-NEXT: .cfi_offset s11, -48 +; RV32ZVE32F-NEXT: addi s0, sp, 128 +; RV32ZVE32F-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVE32F-NEXT: andi sp, sp, -32 +; RV32ZVE32F-NEXT: lw a4, 60(a3) +; RV32ZVE32F-NEXT: lw a5, 56(a3) +; RV32ZVE32F-NEXT: lw a6, 52(a3) +; RV32ZVE32F-NEXT: lw a7, 48(a3) +; RV32ZVE32F-NEXT: lw t0, 44(a3) +; RV32ZVE32F-NEXT: lw t1, 40(a3) +; RV32ZVE32F-NEXT: lw t2, 36(a3) +; RV32ZVE32F-NEXT: lw t3, 32(a3) +; RV32ZVE32F-NEXT: lw t4, 28(a3) +; RV32ZVE32F-NEXT: lw t5, 24(a3) +; RV32ZVE32F-NEXT: lw t6, 20(a3) +; RV32ZVE32F-NEXT: lw s2, 16(a3) +; RV32ZVE32F-NEXT: lw s3, 12(a3) +; RV32ZVE32F-NEXT: lw s5, 8(a3) +; RV32ZVE32F-NEXT: lw s4, 4(a3) +; RV32ZVE32F-NEXT: lw a3, 0(a3) +; RV32ZVE32F-NEXT: lw s6, 0(a2) +; RV32ZVE32F-NEXT: lw s7, 8(a2) +; RV32ZVE32F-NEXT: lw s8, 16(a2) +; RV32ZVE32F-NEXT: lw s9, 24(a2) +; RV32ZVE32F-NEXT: lw s10, 56(a2) +; RV32ZVE32F-NEXT: lw s11, 48(a2) +; RV32ZVE32F-NEXT: lw ra, 40(a2) +; RV32ZVE32F-NEXT: lw a2, 32(a2) +; RV32ZVE32F-NEXT: sw s10, 60(sp) +; RV32ZVE32F-NEXT: sw s11, 56(sp) +; RV32ZVE32F-NEXT: sw ra, 52(sp) +; RV32ZVE32F-NEXT: sw a2, 48(sp) +; RV32ZVE32F-NEXT: sw s9, 44(sp) +; RV32ZVE32F-NEXT: sw s8, 40(sp) +; RV32ZVE32F-NEXT: sw s7, 36(sp) +; RV32ZVE32F-NEXT: sw s6, 32(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: addi a2, sp, 32 +; RV32ZVE32F-NEXT: vle32.v v8, (a2) +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB57_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB57_11 +; RV32ZVE32F-NEXT: .LBB57_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB57_12 +; RV32ZVE32F-NEXT: .LBB57_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; 
RV32ZVE32F-NEXT: bnez a2, .LBB57_13 +; RV32ZVE32F-NEXT: .LBB57_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB57_14 +; RV32ZVE32F-NEXT: .LBB57_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB57_15 +; RV32ZVE32F-NEXT: .LBB57_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB57_16 +; RV32ZVE32F-NEXT: .LBB57_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB57_9 +; RV32ZVE32F-NEXT: .LBB57_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: lw a4, 4(a1) +; RV32ZVE32F-NEXT: lw a5, 0(a1) +; RV32ZVE32F-NEXT: .LBB57_9: # %else20 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw s4, 4(a0) +; RV32ZVE32F-NEXT: sw s5, 8(a0) +; RV32ZVE32F-NEXT: sw s3, 12(a0) +; RV32ZVE32F-NEXT: sw s2, 16(a0) +; RV32ZVE32F-NEXT: sw t6, 20(a0) +; RV32ZVE32F-NEXT: sw t5, 24(a0) +; RV32ZVE32F-NEXT: sw t4, 28(a0) +; RV32ZVE32F-NEXT: sw t3, 32(a0) +; RV32ZVE32F-NEXT: sw t2, 36(a0) +; RV32ZVE32F-NEXT: sw t1, 40(a0) +; RV32ZVE32F-NEXT: sw t0, 44(a0) +; RV32ZVE32F-NEXT: sw a7, 48(a0) +; RV32ZVE32F-NEXT: sw a6, 52(a0) +; RV32ZVE32F-NEXT: sw a5, 56(a0) +; RV32ZVE32F-NEXT: sw a4, 60(a0) +; RV32ZVE32F-NEXT: addi sp, s0, -128 +; RV32ZVE32F-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 116(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 112(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s4, 108(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s5, 104(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s6, 100(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s7, 96(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s8, 92(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s9, 88(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s10, 84(sp) # 4-byte Folded Reload +; 
RV32ZVE32F-NEXT: lw s11, 80(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 128 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB57_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: lw s4, 4(a2) +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB57_2 +; RV32ZVE32F-NEXT: .LBB57_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: lw s3, 4(a2) +; RV32ZVE32F-NEXT: lw s5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB57_3 +; RV32ZVE32F-NEXT: .LBB57_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: lw t6, 4(a2) +; RV32ZVE32F-NEXT: lw s2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB57_4 +; RV32ZVE32F-NEXT: .LBB57_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: lw t4, 4(a2) +; RV32ZVE32F-NEXT: lw t5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB57_5 +; RV32ZVE32F-NEXT: .LBB57_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: lw t2, 4(a2) +; RV32ZVE32F-NEXT: lw t3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB57_6 +; RV32ZVE32F-NEXT: .LBB57_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: lw t0, 4(a2) +; RV32ZVE32F-NEXT: lw t1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB57_7 +; 
RV32ZVE32F-NEXT: .LBB57_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: lw a6, 4(a2) +; RV32ZVE32F-NEXT: lw a7, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB57_8 +; RV32ZVE32F-NEXT: j .LBB57_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a7, v0 +; RV64ZVE32F-NEXT: andi a4, a7, 1 +; RV64ZVE32F-NEXT: beqz a4, .LBB57_9 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: ld a4, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a7, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB57_10 +; RV64ZVE32F-NEXT: .LBB57_2: +; RV64ZVE32F-NEXT: ld a5, 8(a3) +; RV64ZVE32F-NEXT: andi a6, a7, 4 +; RV64ZVE32F-NEXT: bnez a6, .LBB57_11 +; RV64ZVE32F-NEXT: .LBB57_3: +; RV64ZVE32F-NEXT: ld a6, 16(a3) +; RV64ZVE32F-NEXT: andi t0, a7, 8 +; RV64ZVE32F-NEXT: bnez t0, .LBB57_12 +; RV64ZVE32F-NEXT: .LBB57_4: +; RV64ZVE32F-NEXT: ld t0, 24(a3) +; RV64ZVE32F-NEXT: andi t1, a7, 16 +; RV64ZVE32F-NEXT: bnez t1, .LBB57_13 +; RV64ZVE32F-NEXT: .LBB57_5: +; RV64ZVE32F-NEXT: ld t1, 32(a3) +; RV64ZVE32F-NEXT: andi t2, a7, 32 +; RV64ZVE32F-NEXT: bnez t2, .LBB57_14 +; RV64ZVE32F-NEXT: .LBB57_6: +; RV64ZVE32F-NEXT: ld t2, 40(a3) +; RV64ZVE32F-NEXT: andi t3, a7, 64 +; RV64ZVE32F-NEXT: bnez t3, .LBB57_15 +; RV64ZVE32F-NEXT: .LBB57_7: +; RV64ZVE32F-NEXT: ld t3, 48(a3) +; RV64ZVE32F-NEXT: andi a7, a7, -128 +; RV64ZVE32F-NEXT: bnez a7, .LBB57_16 +; RV64ZVE32F-NEXT: .LBB57_8: +; RV64ZVE32F-NEXT: ld a1, 56(a3) +; RV64ZVE32F-NEXT: j .LBB57_17 +; RV64ZVE32F-NEXT: .LBB57_9: +; RV64ZVE32F-NEXT: ld a4, 0(a3) +; RV64ZVE32F-NEXT: andi a5, a7, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB57_2 +; RV64ZVE32F-NEXT: .LBB57_10: # %cond.load1 +; RV64ZVE32F-NEXT: ld a5, 8(a2) +; RV64ZVE32F-NEXT: 
slli a5, a5, 3 +; RV64ZVE32F-NEXT: add a5, a1, a5 +; RV64ZVE32F-NEXT: ld a5, 0(a5) +; RV64ZVE32F-NEXT: andi a6, a7, 4 +; RV64ZVE32F-NEXT: beqz a6, .LBB57_3 +; RV64ZVE32F-NEXT: .LBB57_11: # %cond.load4 +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi t0, a7, 8 +; RV64ZVE32F-NEXT: beqz t0, .LBB57_4 +; RV64ZVE32F-NEXT: .LBB57_12: # %cond.load7 +; RV64ZVE32F-NEXT: ld t0, 24(a2) +; RV64ZVE32F-NEXT: slli t0, t0, 3 +; RV64ZVE32F-NEXT: add t0, a1, t0 +; RV64ZVE32F-NEXT: ld t0, 0(t0) +; RV64ZVE32F-NEXT: andi t1, a7, 16 +; RV64ZVE32F-NEXT: beqz t1, .LBB57_5 +; RV64ZVE32F-NEXT: .LBB57_13: # %cond.load10 +; RV64ZVE32F-NEXT: ld t1, 32(a2) +; RV64ZVE32F-NEXT: slli t1, t1, 3 +; RV64ZVE32F-NEXT: add t1, a1, t1 +; RV64ZVE32F-NEXT: ld t1, 0(t1) +; RV64ZVE32F-NEXT: andi t2, a7, 32 +; RV64ZVE32F-NEXT: beqz t2, .LBB57_6 +; RV64ZVE32F-NEXT: .LBB57_14: # %cond.load13 +; RV64ZVE32F-NEXT: ld t2, 40(a2) +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: ld t2, 0(t2) +; RV64ZVE32F-NEXT: andi t3, a7, 64 +; RV64ZVE32F-NEXT: beqz t3, .LBB57_7 +; RV64ZVE32F-NEXT: .LBB57_15: # %cond.load16 +; RV64ZVE32F-NEXT: ld t3, 48(a2) +; RV64ZVE32F-NEXT: slli t3, t3, 3 +; RV64ZVE32F-NEXT: add t3, a1, t3 +; RV64ZVE32F-NEXT: ld t3, 0(t3) +; RV64ZVE32F-NEXT: andi a7, a7, -128 +; RV64ZVE32F-NEXT: beqz a7, .LBB57_8 +; RV64ZVE32F-NEXT: .LBB57_16: # %cond.load19 +; RV64ZVE32F-NEXT: ld a2, 56(a2) +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: .LBB57_17: # %else20 +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: sd a5, 8(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd t0, 24(a0) +; RV64ZVE32F-NEXT: sd t1, 32(a0) +; RV64ZVE32F-NEXT: sd t2, 40(a0) +; RV64ZVE32F-NEXT: sd t3, 48(a0) +; RV64ZVE32F-NEXT: sd a1, 56(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i64, i64* 
%base, <8 x i64> %idxs %v = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %ptrs, i32 8, <8 x i1> %m, <8 x i64> %passthru) ret <8 x i64> %v @@ -1261,19 +7544,41 @@ define <8 x i64> @mgather_baseidx_v8i64(i64* %base, <8 x i64> %idxs, <8 x i1> %m declare <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*>, i32, <1 x i1>, <1 x half>) define <1 x half> @mgather_v1f16(<1 x half*> %ptrs, <1 x i1> %m, <1 x half> %passthru) { -; RV32-LABEL: mgather_v1f16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v1f16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv1r.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v1f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v1f16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v1f16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vmv1r.v v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v1f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB58_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: flh ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: .LBB58_2: # %else +; RV64ZVE32F-NEXT: ret %v = call <1 x half> 
@llvm.masked.gather.v1f16.v1p0f16(<1 x half*> %ptrs, i32 2, <1 x i1> %m, <1 x half> %passthru) ret <1 x half> %v } @@ -1281,19 +7586,65 @@ define <1 x half> @mgather_v1f16(<1 x half*> %ptrs, <1 x i1> %m, <1 x half> %pas declare <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*>, i32, <2 x i1>, <2 x half>) define <2 x half> @mgather_v2f16(<2 x half*> %ptrs, <2 x i1> %m, <2 x half> %passthru) { -; RV32-LABEL: mgather_v2f16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2f16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv1r.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2f16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2f16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vmv1r.v v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; 
RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB59_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB59_4 +; RV64ZVE32F-NEXT: .LBB59_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB59_3: # %cond.load +; RV64ZVE32F-NEXT: flh ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB59_2 +; RV64ZVE32F-NEXT: .LBB59_4: # %cond.load1 +; RV64ZVE32F-NEXT: flh ft0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> %ptrs, i32 2, <2 x i1> %m, <2 x half> %passthru) ret <2 x half> %v } @@ -1308,12 +7659,77 @@ define <4 x half> @mgather_v4f16(<4 x half*> %ptrs, <4 x i1> %m, <4 x half> %pas ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_v4f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v4f16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v4f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 
4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a1) +; RV64ZVE32F-NEXT: lbu a1, 15(sp) +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_6 +; RV64ZVE32F-NEXT: .LBB60_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_7 +; RV64ZVE32F-NEXT: .LBB60_3: # %else5 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB60_8 +; RV64ZVE32F-NEXT: .LBB60_4: # %else8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB60_5: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_2 +; RV64ZVE32F-NEXT: .LBB60_6: # %cond.load1 +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_3 +; RV64ZVE32F-NEXT: .LBB60_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB60_4 +; RV64ZVE32F-NEXT: .LBB60_8: # %cond.load7 +; RV64ZVE32F-NEXT: ld a0, 24(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %ptrs, i32 2, <4 x i1> %m, <4 x half> %passthru) ret <4 x half> %v } @@ -1326,12 +7742,77 @@ define <4 x half> @mgather_truemask_v4f16(<4 x half*> %ptrs, <4 x half> %passthr ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_truemask_v4f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8 -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_truemask_v4f16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8 +; RV64V-NEXT: vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_truemask_v4f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB61_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB61_6 +; RV64ZVE32F-NEXT: .LBB61_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB61_7 +; RV64ZVE32F-NEXT: .LBB61_3: # %else5 +; RV64ZVE32F-NEXT: 
andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB61_8 +; RV64ZVE32F-NEXT: .LBB61_4: # %else8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB61_5: # %cond.load +; RV64ZVE32F-NEXT: flh ft0, 0(a4) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB61_2 +; RV64ZVE32F-NEXT: .LBB61_6: # %cond.load1 +; RV64ZVE32F-NEXT: flh ft0, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB61_3 +; RV64ZVE32F-NEXT: .LBB61_7: # %cond.load4 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB61_4 +; RV64ZVE32F-NEXT: .LBB61_8: # %cond.load7 +; RV64ZVE32F-NEXT: flh ft0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %ptrs, i32 2, <4 x i1> %mtrue, <4 x half> %passthru) @@ -1344,10 +7825,14 @@ define <4 x half> @mgather_falsemask_v4f16(<4 x half*> %ptrs, <4 x half> %passth ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_falsemask_v4f16: -; RV64: # %bb.0: -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_falsemask_v4f16: +; RV64V: # %bb.0: +; RV64V-NEXT: 
vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_falsemask_v4f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ret %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %ptrs, i32 2, <4 x i1> zeroinitializer, <4 x half> %passthru) ret <4 x half> %v } @@ -1362,12 +7847,111 @@ define <8 x half> @mgather_v8f16(<8 x half*> %ptrs, <8 x i1> %m, <8 x half> %pas ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_v8f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64-NEXT: vluxei64.v v12, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v8f16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64V-NEXT: vluxei64.v v12, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v8f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: bnez a2, .LBB63_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB63_10 +; RV64ZVE32F-NEXT: .LBB63_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB63_11 +; RV64ZVE32F-NEXT: .LBB63_3: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB63_12 +; RV64ZVE32F-NEXT: .LBB63_4: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB63_13 +; RV64ZVE32F-NEXT: .LBB63_5: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB63_14 +; RV64ZVE32F-NEXT: .LBB63_6: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: bnez a2, .LBB63_15 +; RV64ZVE32F-NEXT: .LBB63_7: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB63_16 +; RV64ZVE32F-NEXT: .LBB63_8: # %else20 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB63_9: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; 
RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB63_2 +; RV64ZVE32F-NEXT: .LBB63_10: # %cond.load1 +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB63_3 +; RV64ZVE32F-NEXT: .LBB63_11: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB63_4 +; RV64ZVE32F-NEXT: .LBB63_12: # %cond.load7 +; RV64ZVE32F-NEXT: ld a2, 24(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB63_5 +; RV64ZVE32F-NEXT: .LBB63_13: # %cond.load10 +; RV64ZVE32F-NEXT: ld a2, 32(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB63_6 +; RV64ZVE32F-NEXT: .LBB63_14: # %cond.load13 +; RV64ZVE32F-NEXT: ld a2, 40(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 5 +; 
RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: beqz a2, .LBB63_7 +; RV64ZVE32F-NEXT: .LBB63_15: # %cond.load16 +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB63_8 +; RV64ZVE32F-NEXT: .LBB63_16: # %cond.load19 +; RV64ZVE32F-NEXT: ld a0, 56(a0) +; RV64ZVE32F-NEXT: flh ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 7 +; RV64ZVE32F-NEXT: ret %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru) ret <8 x half> %v } @@ -1383,15 +7967,140 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(half* %base, <8 x i8> %idxs, <8 x ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i8_v8f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v12, v8 -; RV64-NEXT: vadd.vv v12, v12, v12 -; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i8_v8f16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vadd.vv v12, v12, v12 +; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i8_v8f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; 
RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: .LBB64_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: .LBB64_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB64_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB64_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB64_14 +; RV64ZVE32F-NEXT: .LBB64_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_10 +; RV64ZVE32F-NEXT: .LBB64_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: 
vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB64_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB64_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB64_16 +; RV64ZVE32F-NEXT: .LBB64_12: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB64_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_8 +; RV64ZVE32F-NEXT: .LBB64_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB64_9 +; RV64ZVE32F-NEXT: j .LBB64_10 +; RV64ZVE32F-NEXT: .LBB64_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh 
ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB64_12 +; RV64ZVE32F-NEXT: .LBB64_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flh ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds half, half* %base, <8 x i8> %idxs %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru) ret <8 x half> %v @@ -1408,15 +8117,140 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(half* %base, <8 x i8> %idxs, ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i8_v8f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v12, v8 -; RV64-NEXT: vadd.vv v12, v12, v12 -; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i8_v8f16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vadd.vv v12, v12, v12 +; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; 
RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: .LBB65_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: .LBB65_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB65_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB65_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB65_14 +; RV64ZVE32F-NEXT: .LBB65_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_10 +; RV64ZVE32F-NEXT: .LBB65_9: # %cond.load13 +; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB65_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB65_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB65_16 +; RV64ZVE32F-NEXT: .LBB65_12: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB65_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_8 +; RV64ZVE32F-NEXT: .LBB65_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB65_9 +; RV64ZVE32F-NEXT: j .LBB65_10 +; RV64ZVE32F-NEXT: .LBB65_15: # %cond.load16 +; 
RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB65_12 +; RV64ZVE32F-NEXT: .LBB65_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flh ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %eidxs %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru) @@ -1434,15 +8268,148 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(half* %base, <8 x i8> %idxs, ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_zext_v8i8_v8f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf8 v12, v8 -; RV64-NEXT: vadd.vv v12, v12, v12 -; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8f16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf8 v12, v8 +; RV64V-NEXT: vadd.vv v12, v12, v12 +; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: 
mgather_baseidx_zext_v8i8_v8f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: .LBB66_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: .LBB66_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB66_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB66_13 +; 
RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB66_14 +; RV64ZVE32F-NEXT: .LBB66_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_10 +; RV64ZVE32F-NEXT: .LBB66_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB66_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB66_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB66_16 +; RV64ZVE32F-NEXT: .LBB66_12: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB66_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_8 +; RV64ZVE32F-NEXT: .LBB66_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; 
RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB66_9 +; RV64ZVE32F-NEXT: j .LBB66_10 +; RV64ZVE32F-NEXT: .LBB66_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB66_12 +; RV64ZVE32F-NEXT: .LBB66_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: andi a1, a1, 255 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flh ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %eidxs %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru) @@ -1460,15 +8427,135 @@ define <8 x half> @mgather_baseidx_v8f16(half* %base, <8 x i16> %idxs, <8 x i1> ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf4 v12, v8 -; RV64-NEXT: vadd.vv v12, v12, v12 -; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV64-NEXT: 
vluxei64.v v9, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8f16: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf4 v12, v8 +; RV64V-NEXT: vadd.vv v12, v12, v12 +; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: .LBB67_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: .LBB67_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v11, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, mu +; 
RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB67_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB67_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB67_14 +; RV64ZVE32F-NEXT: .LBB67_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_10 +; RV64ZVE32F-NEXT: .LBB67_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB67_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB67_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB67_16 +; RV64ZVE32F-NEXT: .LBB67_12: # %else20 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB67_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_8 +; RV64ZVE32F-NEXT: .LBB67_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 
+; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB67_9 +; RV64ZVE32F-NEXT: j .LBB67_10 +; RV64ZVE32F-NEXT: .LBB67_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh ft0, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB67_12 +; RV64ZVE32F-NEXT: .LBB67_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flh ft0, 0(a0) +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %idxs %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru) ret <8 x half> %v @@ -1477,19 +8564,41 @@ define <8 x half> @mgather_baseidx_v8f16(half* %base, <8 x i16> %idxs, <8 x i1> declare <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*>, i32, <1 x i1>, <1 x float>) define <1 x float> @mgather_v1f32(<1 x float*> %ptrs, <1 x i1> %m, <1 x float> %passthru) { -; RV32-LABEL: mgather_v1f32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v1f32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli 
zero, 1, e32, mf2, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv1r.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v1f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v1f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v1f32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vmv.v.v v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v1f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB68_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: flw ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: .LBB68_2: # %else +; RV64ZVE32F-NEXT: ret %v = call <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*> %ptrs, i32 4, <1 x i1> %m, <1 x float> %passthru) ret <1 x float> %v } @@ -1497,19 +8606,65 @@ define <1 x float> @mgather_v1f32(<1 x float*> %ptrs, <1 x i1> %m, <1 x float> % declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>) define <2 x float> @mgather_v2f32(<2 x float*> %ptrs, <2 x i1> %m, <2 x float> %passthru) { -; RV32-LABEL: mgather_v2f32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2f32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; 
RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv1r.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv1r.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2f32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: vmv.v.v v8, v9 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB69_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB69_4 +; RV64ZVE32F-NEXT: .LBB69_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB69_3: # %cond.load +; RV64ZVE32F-NEXT: flw ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB69_2 +; RV64ZVE32F-NEXT: .LBB69_4: # %cond.load1 +; RV64ZVE32F-NEXT: flw ft0, 0(a1) +; RV64ZVE32F-NEXT: 
vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %ptrs, i32 4, <2 x i1> %m, <2 x float> %passthru) ret <2 x float> %v } @@ -1524,12 +8679,77 @@ define <4 x float> @mgather_v4f32(<4 x float*> %ptrs, <4 x i1> %m, <4 x float> % ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_v4f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v4f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v4f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a1) +; RV64ZVE32F-NEXT: lbu a1, 15(sp) +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_6 +; RV64ZVE32F-NEXT: .LBB70_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_7 +; RV64ZVE32F-NEXT: .LBB70_3: # %else5 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB70_8 +; RV64ZVE32F-NEXT: 
.LBB70_4: # %else8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB70_5: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_2 +; RV64ZVE32F-NEXT: .LBB70_6: # %cond.load1 +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_3 +; RV64ZVE32F-NEXT: .LBB70_7: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB70_4 +; RV64ZVE32F-NEXT: .LBB70_8: # %cond.load7 +; RV64ZVE32F-NEXT: ld a0, 24(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> %m, <4 x float> %passthru) ret <4 x float> %v } @@ -1541,12 +8761,77 @@ define <4 x float> @mgather_truemask_v4f32(<4 x float*> %ptrs, <4 x float> %pass ; RV32-NEXT: vluxei32.v v8, (zero), v8 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_truemask_v4f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8 -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_truemask_v4f32: 
+; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8 +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_truemask_v4f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB71_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB71_6 +; RV64ZVE32F-NEXT: .LBB71_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB71_7 +; RV64ZVE32F-NEXT: .LBB71_3: # %else5 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB71_8 +; RV64ZVE32F-NEXT: .LBB71_4: # %else8 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB71_5: # %cond.load +; RV64ZVE32F-NEXT: flw ft0, 0(a4) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB71_2 +; RV64ZVE32F-NEXT: .LBB71_6: # %cond.load1 +; RV64ZVE32F-NEXT: flw ft0, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: 
andi a3, a0, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB71_3 +; RV64ZVE32F-NEXT: .LBB71_7: # %cond.load4 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB71_4 +; RV64ZVE32F-NEXT: .LBB71_8: # %cond.load7 +; RV64ZVE32F-NEXT: flw ft0, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v9, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 3 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> %mtrue, <4 x float> %passthru) @@ -1559,10 +8844,14 @@ define <4 x float> @mgather_falsemask_v4f32(<4 x float*> %ptrs, <4 x float> %pas ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_falsemask_v4f32: -; RV64: # %bb.0: -; RV64-NEXT: vmv1r.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_falsemask_v4f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vmv1r.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_falsemask_v4f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ret %v = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> zeroinitializer, <4 x float> %passthru) ret <4 x float> %v } @@ -1577,12 +8866,111 @@ define <8 x float> @mgather_v8f32(<8 x float*> %ptrs, <8 x i1> %m, <8 x float> % ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_v8f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v12, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v8f32: +; RV64V: # %bb.0: +; 
RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v12, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: bnez a2, .LBB73_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB73_10 +; RV64ZVE32F-NEXT: .LBB73_2: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB73_11 +; RV64ZVE32F-NEXT: .LBB73_3: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB73_12 +; RV64ZVE32F-NEXT: .LBB73_4: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB73_13 +; RV64ZVE32F-NEXT: .LBB73_5: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB73_14 +; RV64ZVE32F-NEXT: .LBB73_6: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: bnez a2, .LBB73_15 +; RV64ZVE32F-NEXT: .LBB73_7: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB73_16 +; RV64ZVE32F-NEXT: .LBB73_8: # %else20 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB73_9: # %cond.load +; RV64ZVE32F-NEXT: ld a2, 0(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB73_2 +; RV64ZVE32F-NEXT: .LBB73_10: # %cond.load1 +; RV64ZVE32F-NEXT: ld a2, 8(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB73_3 +; RV64ZVE32F-NEXT: .LBB73_11: # %cond.load4 +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; 
RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB73_4 +; RV64ZVE32F-NEXT: .LBB73_12: # %cond.load7 +; RV64ZVE32F-NEXT: ld a2, 24(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB73_5 +; RV64ZVE32F-NEXT: .LBB73_13: # %cond.load10 +; RV64ZVE32F-NEXT: ld a2, 32(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB73_6 +; RV64ZVE32F-NEXT: .LBB73_14: # %cond.load13 +; RV64ZVE32F-NEXT: ld a2, 40(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 5 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: beqz a2, .LBB73_7 +; RV64ZVE32F-NEXT: .LBB73_15: # %cond.load16 +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB73_8 +; RV64ZVE32F-NEXT: .LBB73_16: # %cond.load19 +; RV64ZVE32F-NEXT: ld a0, 56(a0) +; RV64ZVE32F-NEXT: flw ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: 
vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 7 +; RV64ZVE32F-NEXT: ret %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) ret <8 x float> %v } @@ -1597,15 +8985,140 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(float* %base, <8 x i8> %idxs, <8 ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i8_v8f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i8_v8f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i8_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: .LBB74_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, 
mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB74_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB74_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_14 +; RV64ZVE32F-NEXT: .LBB74_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_10 +; RV64ZVE32F-NEXT: .LBB74_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB74_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB74_16 +; RV64ZVE32F-NEXT: .LBB74_12: # 
%else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB74_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_8 +; RV64ZVE32F-NEXT: .LBB74_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_9 +; RV64ZVE32F-NEXT: j .LBB74_10 +; RV64ZVE32F-NEXT: .LBB74_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB74_12 +; RV64ZVE32F-NEXT: .LBB74_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flw ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: 
vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds float, float* %base, <8 x i8> %idxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) ret <8 x float> %v @@ -1621,15 +9134,140 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(float* %base, <8 x i8> %idxs ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i8_v8f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i8_v8f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: .LBB75_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; 
RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB75_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB75_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB75_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB75_14 +; RV64ZVE32F-NEXT: .LBB75_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_10 +; RV64ZVE32F-NEXT: .LBB75_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB75_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB75_15 +; RV64ZVE32F-NEXT: # %bb.11: # 
%else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB75_16 +; RV64ZVE32F-NEXT: .LBB75_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB75_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_8 +; RV64ZVE32F-NEXT: .LBB75_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB75_9 +; RV64ZVE32F-NEXT: j .LBB75_10 +; RV64ZVE32F-NEXT: .LBB75_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB75_12 +; RV64ZVE32F-NEXT: .LBB75_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flw ft0, 
0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) @@ -1646,15 +9284,148 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(float* %base, <8 x i8> %idxs ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_zext_v8i8_v8f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf8 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf8 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i8_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: .LBB76_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_4 +; RV64ZVE32F-NEXT: # %bb.3: # 
%cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB76_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB76_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB76_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB76_14 +; RV64ZVE32F-NEXT: .LBB76_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_10 +; RV64ZVE32F-NEXT: .LBB76_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, 
mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB76_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB76_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB76_16 +; RV64ZVE32F-NEXT: .LBB76_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB76_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_8 +; RV64ZVE32F-NEXT: .LBB76_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB76_9 +; RV64ZVE32F-NEXT: j .LBB76_10 +; RV64ZVE32F-NEXT: .LBB76_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, 
tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB76_12 +; RV64ZVE32F-NEXT: .LBB76_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: andi a1, a1, 255 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flw ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) @@ -1671,15 +9442,141 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(float* %base, <8 x i16> %idxs, < ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i16_v8f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf4 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i16_v8f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf4 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i16_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_2 +; RV64ZVE32F-NEXT: # %bb.1: 
# %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: .LBB77_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB77_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB77_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB77_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB77_14 +; RV64ZVE32F-NEXT: .LBB77_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_10 +; RV64ZVE32F-NEXT: .LBB77_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, 
mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB77_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB77_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB77_16 +; RV64ZVE32F-NEXT: .LBB77_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB77_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_8 +; RV64ZVE32F-NEXT: .LBB77_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB77_9 +; RV64ZVE32F-NEXT: j .LBB77_10 +; RV64ZVE32F-NEXT: .LBB77_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; 
RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB77_12 +; RV64ZVE32F-NEXT: .LBB77_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flw ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds float, float* %base, <8 x i16> %idxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) ret <8 x float> %v @@ -1695,15 +9592,141 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(float* %base, <8 x i16> %id ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i16_v8f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf4 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i16_v8f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf4 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8f32: +; RV64ZVE32F: # 
%bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: .LBB78_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB78_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB78_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB78_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB78_14 +; 
RV64ZVE32F-NEXT: .LBB78_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_10 +; RV64ZVE32F-NEXT: .LBB78_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB78_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB78_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB78_16 +; RV64ZVE32F-NEXT: .LBB78_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB78_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_8 +; RV64ZVE32F-NEXT: .LBB78_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 
+; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB78_9 +; RV64ZVE32F-NEXT: j .LBB78_10 +; RV64ZVE32F-NEXT: .LBB78_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB78_12 +; RV64ZVE32F-NEXT: .LBB78_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flw ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) @@ -1720,15 +9743,151 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(float* %base, <8 x i16> %id ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_zext_v8i16_v8f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf4 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf4 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; 
RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: lui a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: addiw a1, a1, -1 +; RV64ZVE32F-NEXT: beqz a3, .LBB79_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: flw ft0, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: .LBB79_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB79_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: flw ft0, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB79_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB79_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: flw ft0, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, 
e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB79_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB79_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB79_14 +; RV64ZVE32F-NEXT: .LBB79_8: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB79_10 +; RV64ZVE32F-NEXT: .LBB79_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: flw ft0, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB79_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB79_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: bnez a2, .LBB79_16 +; RV64ZVE32F-NEXT: .LBB79_12: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB79_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: flw ft0, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; 
RV64ZVE32F-NEXT: beqz a3, .LBB79_8 +; RV64ZVE32F-NEXT: .LBB79_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: flw ft0, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB79_9 +; RV64ZVE32F-NEXT: j .LBB79_10 +; RV64ZVE32F-NEXT: .LBB79_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: flw ft0, 0(a3) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB79_12 +; RV64ZVE32F-NEXT: .LBB79_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: and a1, a2, a1 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flw ft0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) @@ -1744,15 +9903,137 @@ define <8 x float> @mgather_baseidx_v8f32(float* %base, <8 x i32> %idxs, <8 x 
i1 ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf2 v12, v8 -; RV64-NEXT: vsll.vi v12, v12, 2 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v12, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8f32: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf2 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vfmv.s.f v10, ft0 +; RV64ZVE32F-NEXT: .LBB80_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB80_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi 
a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB80_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB80_13 +; RV64ZVE32F-NEXT: .LBB80_6: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB80_14 +; RV64ZVE32F-NEXT: .LBB80_7: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_9 +; RV64ZVE32F-NEXT: .LBB80_8: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB80_9: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB80_15 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB80_16 +; RV64ZVE32F-NEXT: .LBB80_11: # %else20 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB80_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v14, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_6 +; RV64ZVE32F-NEXT: .LBB80_13: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; 
RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_7 +; RV64ZVE32F-NEXT: .LBB80_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB80_8 +; RV64ZVE32F-NEXT: j .LBB80_9 +; RV64ZVE32F-NEXT: .LBB80_15: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw ft0, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v12, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB80_11 +; RV64ZVE32F-NEXT: .LBB80_16: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: flw ft0, 0(a0) +; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %idxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) ret <8 x float> %v @@ -1761,19 +10042,47 @@ define <8 x float> @mgather_baseidx_v8f32(float* %base, <8 x i32> %idxs, <8 x i1 declare <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*>, i32, <1 x i1>, <1 x double>) define <1 
x double> @mgather_v1f64(<1 x double*> %ptrs, <1 x i1> %m, <1 x double> %passthru) { -; RV32-LABEL: mgather_v1f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v1f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v1f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v1f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v1f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 +; RV32ZVE32F-NEXT: andi a0, a0, 1 +; RV32ZVE32F-NEXT: beqz a0, .LBB81_2 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.load +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a0) +; RV32ZVE32F-NEXT: .LBB81_2: # %else +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v1f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB81_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: fld fa0, 0(a0) +; RV64ZVE32F-NEXT: .LBB81_2: # %else +; RV64ZVE32F-NEXT: ret %v = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> %ptrs, i32 8, <1 x i1> %m, <1 x double> %passthru) ret <1 x double> %v } @@ -1781,19 +10090,90 
@@ define <1 x double> @mgather_v1f64(<1 x double*> %ptrs, <1 x i1> %m, <1 x double declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>) define <2 x double> @mgather_v2f64(<2 x double*> %ptrs, <2 x i1> %m, <2 x double> %passthru) { -; RV32-LABEL: mgather_v2f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v9 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v2f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32V-NEXT: vluxei32.v v9, (zero), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v9 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v2f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v2f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v2f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a0, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a0) +; RV32ZVE32F-NEXT: lbu a0, 15(sp) +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB82_3 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a0, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB82_4 +; RV32ZVE32F-NEXT: .LBB82_2: # %else2 +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; 
RV32ZVE32F-NEXT: .LBB82_3: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB82_2 +; RV32ZVE32F-NEXT: .LBB82_4: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fld fa1, 0(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_v2f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB82_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB82_4 +; RV64ZVE32F-NEXT: .LBB82_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB82_3: # %cond.load +; RV64ZVE32F-NEXT: fld fa0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB82_2 +; RV64ZVE32F-NEXT: .LBB82_4: # %cond.load1 +; RV64ZVE32F-NEXT: fld fa1, 0(a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %v = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 8, <2 x i1> %m, <2 x double> %passthru) ret <2 x double> %v } @@ -1801,36 +10181,266 @@ define <2 x double> @mgather_v2f64(<2 x double*> %ptrs, <2 x i1> %m, <2 x double declare 
<4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>) define <4 x double> @mgather_v4f64(<4 x double*> %ptrs, <4 x i1> %m, <4 x double> %passthru) { -; RV32-LABEL: mgather_v4f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vluxei32.v v10, (zero), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v10 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v4f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32V-NEXT: vluxei32.v v10, (zero), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v10 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v4f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-NEXT: vluxei64.v v10, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v4f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64V-NEXT: vluxei64.v v10, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v10 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v4f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a1, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a1) +; RV32ZVE32F-NEXT: lbu a1, 15(sp) +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB83_6 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB83_7 +; RV32ZVE32F-NEXT: .LBB83_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB83_8 +; RV32ZVE32F-NEXT: .LBB83_3: # %else5 +; RV32ZVE32F-NEXT: andi a1, a1, 8 +; 
RV32ZVE32F-NEXT: beqz a1, .LBB83_5 +; RV32ZVE32F-NEXT: .LBB83_4: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa3, 0(a1) +; RV32ZVE32F-NEXT: .LBB83_5: # %else8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB83_6: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB83_2 +; RV32ZVE32F-NEXT: .LBB83_7: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v9 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB83_3 +; RV32ZVE32F-NEXT: .LBB83_8: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v9 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB83_4 +; RV32ZVE32F-NEXT: j .LBB83_5 +; +; RV64ZVE32F-LABEL: mgather_v4f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; 
RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB83_6 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB83_7 +; RV64ZVE32F-NEXT: .LBB83_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB83_8 +; RV64ZVE32F-NEXT: .LBB83_3: # %else5 +; RV64ZVE32F-NEXT: andi a2, a2, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_5 +; RV64ZVE32F-NEXT: .LBB83_4: # %cond.load7 +; RV64ZVE32F-NEXT: ld a1, 24(a1) +; RV64ZVE32F-NEXT: fld fa3, 0(a1) +; RV64ZVE32F-NEXT: .LBB83_5: # %else8 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB83_6: # %cond.load +; RV64ZVE32F-NEXT: ld a3, 0(a1) +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB83_2 +; RV64ZVE32F-NEXT: .LBB83_7: # %cond.load1 +; RV64ZVE32F-NEXT: ld a3, 8(a1) +; RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB83_3 +; RV64ZVE32F-NEXT: .LBB83_8: # %cond.load4 +; RV64ZVE32F-NEXT: ld a3, 16(a1) +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a2, a2, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_4 +; RV64ZVE32F-NEXT: j .LBB83_5 %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 8, <4 x i1> %m, <4 x double> %passthru) ret <4 x double> %v } define <4 x double> @mgather_truemask_v4f64(<4 x double*> %ptrs, <4 x double> %passthru) { -; RV32-LABEL: mgather_truemask_v4f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vluxei32.v v10, (zero), v8 -; RV32-NEXT: vmv.v.v v8, v10 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_truemask_v4f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32V-NEXT: vluxei32.v v10, (zero), v8 +; RV32V-NEXT: vmv.v.v v8, v10 +; RV32V-NEXT: ret ; -; RV64-LABEL: 
mgather_truemask_v4f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-NEXT: vluxei64.v v8, (zero), v8 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_truemask_v4f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64V-NEXT: vluxei64.v v8, (zero), v8 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_truemask_v4f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmset.m v0 +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a1, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a1) +; RV32ZVE32F-NEXT: lb a1, 15(sp) +; RV32ZVE32F-NEXT: beqz zero, .LBB84_6 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB84_7 +; RV32ZVE32F-NEXT: .LBB84_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB84_8 +; RV32ZVE32F-NEXT: .LBB84_3: # %else5 +; RV32ZVE32F-NEXT: andi a1, a1, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB84_5 +; RV32ZVE32F-NEXT: .LBB84_4: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa3, 0(a1) +; RV32ZVE32F-NEXT: .LBB84_5: # %else8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB84_6: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld 
fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB84_2 +; RV32ZVE32F-NEXT: .LBB84_7: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v9 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB84_3 +; RV32ZVE32F-NEXT: .LBB84_8: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v9 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB84_4 +; RV32ZVE32F-NEXT: j .LBB84_5 +; +; RV64ZVE32F-LABEL: mgather_truemask_v4f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a2, 24(a1) +; RV64ZVE32F-NEXT: ld a3, 16(a1) +; RV64ZVE32F-NEXT: ld a4, 8(a1) +; RV64ZVE32F-NEXT: ld a5, 0(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a1) +; RV64ZVE32F-NEXT: lb a1, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB84_6 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a5, a1, 2 +; RV64ZVE32F-NEXT: bnez a5, .LBB84_7 +; RV64ZVE32F-NEXT: .LBB84_2: # %else2 +; RV64ZVE32F-NEXT: andi a4, a1, 4 +; RV64ZVE32F-NEXT: bnez a4, .LBB84_8 +; RV64ZVE32F-NEXT: .LBB84_3: # %else5 +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB84_5 +; RV64ZVE32F-NEXT: .LBB84_4: # %cond.load7 +; RV64ZVE32F-NEXT: fld fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB84_5: 
# %else8 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB84_6: # %cond.load +; RV64ZVE32F-NEXT: fld fa0, 0(a5) +; RV64ZVE32F-NEXT: andi a5, a1, 2 +; RV64ZVE32F-NEXT: beqz a5, .LBB84_2 +; RV64ZVE32F-NEXT: .LBB84_7: # %cond.load1 +; RV64ZVE32F-NEXT: fld fa1, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a1, 4 +; RV64ZVE32F-NEXT: beqz a4, .LBB84_3 +; RV64ZVE32F-NEXT: .LBB84_8: # %cond.load4 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a1, a1, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB84_4 +; RV64ZVE32F-NEXT: j .LBB84_5 %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 8, <4 x i1> %mtrue, <4 x double> %passthru) @@ -1838,10 +10448,31 @@ define <4 x double> @mgather_truemask_v4f64(<4 x double*> %ptrs, <4 x double> %p } define <4 x double> @mgather_falsemask_v4f64(<4 x double*> %ptrs, <4 x double> %passthru) { -; CHECK-LABEL: mgather_falsemask_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v8, v10 -; CHECK-NEXT: ret +; RV32V-LABEL: mgather_falsemask_v4f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vmv2r.v v8, v10 +; RV32V-NEXT: ret +; +; RV64V-LABEL: mgather_falsemask_v4f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vmv2r.v v8, v10 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_falsemask_v4f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_falsemask_v4f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: ret %v = call <4 x double> 
@llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %ptrs, i32 8, <4 x i1> zeroinitializer, <4 x double> %passthru) ret <4 x double> %v } @@ -1849,68 +10480,625 @@ define <4 x double> @mgather_falsemask_v4f64(<4 x double*> %ptrs, <4 x double> % declare <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*>, i32, <8 x i1>, <8 x double>) define <8 x double> @mgather_v8f64(<8 x double*> %ptrs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (zero), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (zero), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vluxei64.v v12, (zero), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vluxei64.v v12, (zero), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB86_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB86_11 +; RV32ZVE32F-NEXT: .LBB86_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB86_12 +; RV32ZVE32F-NEXT: .LBB86_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB86_13 +; RV32ZVE32F-NEXT: .LBB86_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB86_14 +; RV32ZVE32F-NEXT: .LBB86_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, 
.LBB86_15 +; RV32ZVE32F-NEXT: .LBB86_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB86_16 +; RV32ZVE32F-NEXT: .LBB86_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB86_9 +; RV32ZVE32F-NEXT: .LBB86_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB86_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB86_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB86_2 +; RV32ZVE32F-NEXT: .LBB86_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB86_3 +; RV32ZVE32F-NEXT: .LBB86_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB86_4 +; RV32ZVE32F-NEXT: .LBB86_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB86_5 +; RV32ZVE32F-NEXT: .LBB86_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, 
m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB86_6 +; RV32ZVE32F-NEXT: .LBB86_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB86_7 +; RV32ZVE32F-NEXT: .LBB86_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB86_8 +; RV32ZVE32F-NEXT: j .LBB86_9 +; +; RV64ZVE32F-LABEL: mgather_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_10 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_11 +; RV64ZVE32F-NEXT: .LBB86_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_12 +; RV64ZVE32F-NEXT: .LBB86_3: # %else5 +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_13 +; RV64ZVE32F-NEXT: .LBB86_4: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_14 +; RV64ZVE32F-NEXT: .LBB86_5: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_15 +; RV64ZVE32F-NEXT: .LBB86_6: # %else14 +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_16 +; RV64ZVE32F-NEXT: .LBB86_7: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB86_9 +; RV64ZVE32F-NEXT: .LBB86_8: # %cond.load19 +; RV64ZVE32F-NEXT: ld a1, 56(a1) +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB86_9: # %else20 +; 
RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB86_10: # %cond.load +; RV64ZVE32F-NEXT: ld a3, 0(a1) +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_2 +; RV64ZVE32F-NEXT: .LBB86_11: # %cond.load1 +; RV64ZVE32F-NEXT: ld a3, 8(a1) +; RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_3 +; RV64ZVE32F-NEXT: .LBB86_12: # %cond.load4 +; RV64ZVE32F-NEXT: ld a3, 16(a1) +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_4 +; RV64ZVE32F-NEXT: .LBB86_13: # %cond.load7 +; RV64ZVE32F-NEXT: ld a3, 24(a1) +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_5 +; RV64ZVE32F-NEXT: .LBB86_14: # %cond.load10 +; RV64ZVE32F-NEXT: ld a3, 32(a1) +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_6 +; RV64ZVE32F-NEXT: .LBB86_15: # %cond.load13 +; RV64ZVE32F-NEXT: ld a3, 40(a1) +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_7 +; RV64ZVE32F-NEXT: .LBB86_16: # %cond.load16 +; RV64ZVE32F-NEXT: ld a3, 48(a1) +; RV64ZVE32F-NEXT: fld fa6, 0(a3) +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: bnez a2, .LBB86_8 +; RV64ZVE32F-NEXT: j .LBB86_9 %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v } define <8 x double> @mgather_baseidx_v8i8_v8f64(double* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_baseidx_v8i8_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli 
zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsext.vf4 v10, v8 -; RV32-NEXT: vsll.vi v8, v10, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_v8i8_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsext.vf4 v10, v8 +; RV32V-NEXT: vsll.vi v8, v10, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i8_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i8_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_v8i8_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB87_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB87_11 +; RV32ZVE32F-NEXT: .LBB87_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB87_12 +; RV32ZVE32F-NEXT: .LBB87_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB87_13 +; RV32ZVE32F-NEXT: .LBB87_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB87_14 +; RV32ZVE32F-NEXT: .LBB87_5: # 
%else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB87_15 +; RV32ZVE32F-NEXT: .LBB87_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB87_16 +; RV32ZVE32F-NEXT: .LBB87_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB87_9 +; RV32ZVE32F-NEXT: .LBB87_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB87_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB87_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB87_2 +; RV32ZVE32F-NEXT: .LBB87_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB87_3 +; RV32ZVE32F-NEXT: .LBB87_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB87_4 +; RV32ZVE32F-NEXT: .LBB87_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB87_5 +; 
RV32ZVE32F-NEXT: .LBB87_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB87_6 +; RV32ZVE32F-NEXT: .LBB87_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB87_7 +; RV32ZVE32F-NEXT: .LBB87_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB87_8 +; RV32ZVE32F-NEXT: j .LBB87_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i8_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: .LBB87_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: .LBB87_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 
+; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB87_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB87_15 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB87_16 +; RV64ZVE32F-NEXT: .LBB87_8: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_10 +; RV64ZVE32F-NEXT: .LBB87_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB87_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa6, 0(a3) +; RV64ZVE32F-NEXT: .LBB87_12: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB87_14: # %else20 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: 
.LBB87_15: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_8 +; RV64ZVE32F-NEXT: .LBB87_16: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB87_9 +; RV64ZVE32F-NEXT: j .LBB87_10 %ptrs = getelementptr inbounds double, double* %base, <8 x i8> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v } define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(double* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_baseidx_sext_v8i8_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf8 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_sext_v8i8_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf8 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i8_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 -; 
RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i8_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB88_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB88_11 +; RV32ZVE32F-NEXT: .LBB88_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB88_12 +; RV32ZVE32F-NEXT: .LBB88_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB88_13 +; RV32ZVE32F-NEXT: .LBB88_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB88_14 +; RV32ZVE32F-NEXT: .LBB88_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB88_15 +; RV32ZVE32F-NEXT: .LBB88_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB88_16 +; RV32ZVE32F-NEXT: .LBB88_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB88_9 +; RV32ZVE32F-NEXT: .LBB88_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB88_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd 
fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB88_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB88_2 +; RV32ZVE32F-NEXT: .LBB88_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB88_3 +; RV32ZVE32F-NEXT: .LBB88_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB88_4 +; RV32ZVE32F-NEXT: .LBB88_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB88_5 +; RV32ZVE32F-NEXT: .LBB88_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB88_6 +; RV32ZVE32F-NEXT: .LBB88_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB88_7 +; RV32ZVE32F-NEXT: .LBB88_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi 
v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB88_8 +; RV32ZVE32F-NEXT: j .LBB88_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: .LBB88_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: .LBB88_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB88_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB88_15 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB88_16 +; RV64ZVE32F-NEXT: .LBB88_8: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_10 +; RV64ZVE32F-NEXT: .LBB88_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; 
RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB88_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa6, 0(a3) +; RV64ZVE32F-NEXT: .LBB88_12: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB88_14: # %else20 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB88_15: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_8 +; RV64ZVE32F-NEXT: .LBB88_16: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB88_9 +; RV64ZVE32F-NEXT: j .LBB88_10 %eidxs = 
sext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -1918,26 +11106,228 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(double* %base, <8 x i8> %id } define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(double* %base, <8 x i8> %idxs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_baseidx_zext_v8i8_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf8 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_zext_v8i8_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vzext.vf8 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_zext_v8i8_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf8 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf8 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_zext_v8i8_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: 
vzext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB89_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB89_11 +; RV32ZVE32F-NEXT: .LBB89_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB89_12 +; RV32ZVE32F-NEXT: .LBB89_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB89_13 +; RV32ZVE32F-NEXT: .LBB89_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB89_14 +; RV32ZVE32F-NEXT: .LBB89_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB89_15 +; RV32ZVE32F-NEXT: .LBB89_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB89_16 +; RV32ZVE32F-NEXT: .LBB89_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB89_9 +; RV32ZVE32F-NEXT: .LBB89_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB89_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB89_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB89_2 +; RV32ZVE32F-NEXT: .LBB89_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, 
v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB89_3 +; RV32ZVE32F-NEXT: .LBB89_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB89_4 +; RV32ZVE32F-NEXT: .LBB89_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB89_5 +; RV32ZVE32F-NEXT: .LBB89_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB89_6 +; RV32ZVE32F-NEXT: .LBB89_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB89_7 +; RV32ZVE32F-NEXT: .LBB89_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB89_8 +; RV32ZVE32F-NEXT: j .LBB89_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i8_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; 
RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: .LBB89_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: .LBB89_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB89_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB89_15 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB89_16 +; RV64ZVE32F-NEXT: .LBB89_8: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_10 +; RV64ZVE32F-NEXT: .LBB89_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB89_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: 
vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa6, 0(a3) +; RV64ZVE32F-NEXT: .LBB89_12: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB89_14: # %else20 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB89_15: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_8 +; RV64ZVE32F-NEXT: .LBB89_16: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB89_9 +; RV64ZVE32F-NEXT: j .LBB89_10 %eidxs = zext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -1945,50 +11335,440 @@ define <8 x 
double> @mgather_baseidx_zext_v8i8_v8f64(double* %base, <8 x i8> %id } define <8 x double> @mgather_baseidx_v8i16_v8f64(double* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_baseidx_v8i16_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsext.vf2 v10, v8 -; RV32-NEXT: vsll.vi v8, v10, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_v8i16_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsext.vf2 v10, v8 +; RV32V-NEXT: vsll.vi v8, v10, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i16_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf4 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i16_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf4 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_v8i16_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB90_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB90_11 +; RV32ZVE32F-NEXT: .LBB90_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: 
bnez a2, .LBB90_12 +; RV32ZVE32F-NEXT: .LBB90_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB90_13 +; RV32ZVE32F-NEXT: .LBB90_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB90_14 +; RV32ZVE32F-NEXT: .LBB90_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB90_15 +; RV32ZVE32F-NEXT: .LBB90_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB90_16 +; RV32ZVE32F-NEXT: .LBB90_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB90_9 +; RV32ZVE32F-NEXT: .LBB90_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB90_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB90_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB90_2 +; RV32ZVE32F-NEXT: .LBB90_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB90_3 +; RV32ZVE32F-NEXT: .LBB90_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB90_4 +; RV32ZVE32F-NEXT: 
.LBB90_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB90_5 +; RV32ZVE32F-NEXT: .LBB90_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB90_6 +; RV32ZVE32F-NEXT: .LBB90_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB90_7 +; RV32ZVE32F-NEXT: .LBB90_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB90_8 +; RV32ZVE32F-NEXT: j .LBB90_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i16_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: .LBB90_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add 
a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: .LBB90_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB90_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB90_15 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB90_16 +; RV64ZVE32F-NEXT: .LBB90_8: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_10 +; RV64ZVE32F-NEXT: .LBB90_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB90_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa6, 0(a3) +; RV64ZVE32F-NEXT: .LBB90_12: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB90_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB90_14: # 
%else20 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB90_15: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_8 +; RV64ZVE32F-NEXT: .LBB90_16: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB90_9 +; RV64ZVE32F-NEXT: j .LBB90_10 %ptrs = getelementptr inbounds double, double* %base, <8 x i16> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v } define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(double* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_baseidx_sext_v8i16_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf4 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_sext_v8i16_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf4 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, 
ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i16_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf4 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i16_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf4 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB91_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB91_11 +; RV32ZVE32F-NEXT: .LBB91_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB91_12 +; RV32ZVE32F-NEXT: .LBB91_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB91_13 +; RV32ZVE32F-NEXT: .LBB91_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB91_14 +; RV32ZVE32F-NEXT: .LBB91_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB91_15 +; RV32ZVE32F-NEXT: .LBB91_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB91_16 +; RV32ZVE32F-NEXT: .LBB91_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB91_9 +; RV32ZVE32F-NEXT: .LBB91_8: # 
%cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB91_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB91_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB91_2 +; RV32ZVE32F-NEXT: .LBB91_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB91_3 +; RV32ZVE32F-NEXT: .LBB91_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB91_4 +; RV32ZVE32F-NEXT: .LBB91_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB91_5 +; RV32ZVE32F-NEXT: .LBB91_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB91_6 +; RV32ZVE32F-NEXT: .LBB91_15: # %cond.load13 +; RV32ZVE32F-NEXT: 
vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB91_7 +; RV32ZVE32F-NEXT: .LBB91_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB91_8 +; RV32ZVE32F-NEXT: j .LBB91_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: .LBB91_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: .LBB91_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB91_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; 
RV64ZVE32F-NEXT: bnez a3, .LBB91_15 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB91_16 +; RV64ZVE32F-NEXT: .LBB91_8: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_10 +; RV64ZVE32F-NEXT: .LBB91_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB91_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa6, 0(a3) +; RV64ZVE32F-NEXT: .LBB91_12: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB91_14: # %else20 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB91_15: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; 
RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_8 +; RV64ZVE32F-NEXT: .LBB91_16: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB91_9 +; RV64ZVE32F-NEXT: j .LBB91_10 %eidxs = sext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -1996,26 +11776,231 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(double* %base, <8 x i16> % } define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(double* %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_baseidx_zext_v8i16_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf4 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_zext_v8i16_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vzext.vf4 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_zext_v8i16_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf4 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; 
RV64V-LABEL: mgather_baseidx_zext_v8i16_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf4 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB92_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB92_11 +; RV32ZVE32F-NEXT: .LBB92_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB92_12 +; RV32ZVE32F-NEXT: .LBB92_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB92_13 +; RV32ZVE32F-NEXT: .LBB92_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB92_14 +; RV32ZVE32F-NEXT: .LBB92_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB92_15 +; RV32ZVE32F-NEXT: .LBB92_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB92_16 +; RV32ZVE32F-NEXT: .LBB92_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB92_9 +; RV32ZVE32F-NEXT: .LBB92_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB92_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; 
RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB92_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB92_2 +; RV32ZVE32F-NEXT: .LBB92_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB92_3 +; RV32ZVE32F-NEXT: .LBB92_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB92_4 +; RV32ZVE32F-NEXT: .LBB92_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB92_5 +; RV32ZVE32F-NEXT: .LBB92_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB92_6 +; RV32ZVE32F-NEXT: .LBB92_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB92_7 +; RV32ZVE32F-NEXT: .LBB92_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; 
RV32ZVE32F-NEXT: bnez a1, .LBB92_8 +; RV32ZVE32F-NEXT: j .LBB92_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: lui a2, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: addiw a2, a2, -1 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: and a4, a4, a2 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa0, 0(a4) +; RV64ZVE32F-NEXT: .LBB92_2: # %else +; RV64ZVE32F-NEXT: andi a4, a3, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: and a4, a4, a2 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa1, 0(a4) +; RV64ZVE32F-NEXT: .LBB92_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a4, a3, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: and a4, a4, a2 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa2, 0(a4) +; RV64ZVE32F-NEXT: .LBB92_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a4, a3, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a4, .LBB92_15 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a4, a3, 16 +; RV64ZVE32F-NEXT: bnez a4, .LBB92_16 +; RV64ZVE32F-NEXT: .LBB92_8: # %else11 +; RV64ZVE32F-NEXT: andi a4, a3, 32 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_10 +; RV64ZVE32F-NEXT: .LBB92_9: # %cond.load13 +; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: and a4, a4, a2 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa5, 0(a4) +; RV64ZVE32F-NEXT: .LBB92_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a4, a3, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: and a4, a4, a2 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa6, 0(a4) +; RV64ZVE32F-NEXT: .LBB92_12: # %else17 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB92_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a2, a3, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB92_14: # %else20 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB92_15: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: and a4, a4, a2 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa3, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a3, 16 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_8 +; RV64ZVE32F-NEXT: .LBB92_16: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; 
RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: and a4, a4, a2 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa4, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a3, 32 +; RV64ZVE32F-NEXT: bnez a4, .LBB92_9 +; RV64ZVE32F-NEXT: j .LBB92_10 %eidxs = zext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -2023,49 +12008,441 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(double* %base, <8 x i16> % } define <8 x double> @mgather_baseidx_v8i32_v8f64(double* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_baseidx_v8i32_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsll.vi v8, v8, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_v8i32_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsll.vi v8, v8, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8i32_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf2 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8i32_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_v8i32_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu 
+; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB93_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB93_11 +; RV32ZVE32F-NEXT: .LBB93_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB93_12 +; RV32ZVE32F-NEXT: .LBB93_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB93_13 +; RV32ZVE32F-NEXT: .LBB93_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB93_14 +; RV32ZVE32F-NEXT: .LBB93_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB93_15 +; RV32ZVE32F-NEXT: .LBB93_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB93_16 +; RV32ZVE32F-NEXT: .LBB93_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB93_9 +; RV32ZVE32F-NEXT: .LBB93_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB93_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB93_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB93_2 +; RV32ZVE32F-NEXT: .LBB93_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; 
RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB93_3 +; RV32ZVE32F-NEXT: .LBB93_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB93_4 +; RV32ZVE32F-NEXT: .LBB93_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB93_5 +; RV32ZVE32F-NEXT: .LBB93_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB93_6 +; RV32ZVE32F-NEXT: .LBB93_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB93_7 +; RV32ZVE32F-NEXT: .LBB93_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB93_8 +; RV32ZVE32F-NEXT: j .LBB93_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8i32_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB93_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; 
RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: .LBB93_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB93_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: .LBB93_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB93_14 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: bnez a3, .LBB93_15 +; RV64ZVE32F-NEXT: .LBB93_6: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB93_16 +; RV64ZVE32F-NEXT: .LBB93_7: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB93_9 +; RV64ZVE32F-NEXT: .LBB93_8: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB93_9: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB93_11 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa6, 0(a3) +; RV64ZVE32F-NEXT: .LBB93_11: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB93_13 +; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB93_13: # %else20 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB93_14: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB93_6 +; RV64ZVE32F-NEXT: .LBB93_15: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB93_7 +; RV64ZVE32F-NEXT: .LBB93_16: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB93_8 +; RV64ZVE32F-NEXT: j .LBB93_9 %ptrs = getelementptr inbounds double, double* %base, <8 x i32> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v } define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(double* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_baseidx_sext_v8i32_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, 
mu -; RV32-NEXT: vsext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_sext_v8i32_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf2 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_sext_v8i32_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsext.vf2 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_sext_v8i32_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_sext_v8i32_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB94_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB94_11 +; RV32ZVE32F-NEXT: .LBB94_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB94_12 +; RV32ZVE32F-NEXT: .LBB94_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB94_13 +; 
RV32ZVE32F-NEXT: .LBB94_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB94_14 +; RV32ZVE32F-NEXT: .LBB94_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB94_15 +; RV32ZVE32F-NEXT: .LBB94_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB94_16 +; RV32ZVE32F-NEXT: .LBB94_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB94_9 +; RV32ZVE32F-NEXT: .LBB94_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB94_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB94_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB94_2 +; RV32ZVE32F-NEXT: .LBB94_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB94_3 +; RV32ZVE32F-NEXT: .LBB94_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB94_4 +; RV32ZVE32F-NEXT: .LBB94_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; 
RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB94_5 +; RV32ZVE32F-NEXT: .LBB94_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB94_6 +; RV32ZVE32F-NEXT: .LBB94_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB94_7 +; RV32ZVE32F-NEXT: .LBB94_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB94_8 +; RV32ZVE32F-NEXT: j .LBB94_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i32_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB94_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: .LBB94_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB94_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: .LBB94_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, 
e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB94_14 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: bnez a3, .LBB94_15 +; RV64ZVE32F-NEXT: .LBB94_6: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB94_16 +; RV64ZVE32F-NEXT: .LBB94_7: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB94_9 +; RV64ZVE32F-NEXT: .LBB94_8: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB94_9: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB94_11 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa6, 0(a3) +; RV64ZVE32F-NEXT: .LBB94_11: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB94_13 +; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB94_13: # %else20 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; 
RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB94_14: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB94_6 +; RV64ZVE32F-NEXT: .LBB94_15: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB94_7 +; RV64ZVE32F-NEXT: .LBB94_16: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB94_8 +; RV64ZVE32F-NEXT: j .LBB94_9 %eidxs = sext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -2073,26 +12450,230 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(double* %base, <8 x i32> % } define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(double* %base, <8 x i32> %idxs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_baseidx_zext_v8i32_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf2 v16, v8 -; RV32-NEXT: vsll.vi v8, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_zext_v8i32_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: 
vzext.vf2 v16, v8 +; RV32V-NEXT: vsll.vi v8, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_zext_v8i32_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vzext.vf2 v16, v8 -; RV64-NEXT: vsll.vi v8, v16, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_zext_v8i32_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_zext_v8i32_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB95_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB95_11 +; RV32ZVE32F-NEXT: .LBB95_2: # %else2 +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB95_12 +; RV32ZVE32F-NEXT: .LBB95_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB95_13 +; RV32ZVE32F-NEXT: .LBB95_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB95_14 +; RV32ZVE32F-NEXT: .LBB95_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB95_15 +; RV32ZVE32F-NEXT: .LBB95_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB95_16 +; RV32ZVE32F-NEXT: .LBB95_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: 
beqz a1, .LBB95_9 +; RV32ZVE32F-NEXT: .LBB95_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB95_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB95_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB95_2 +; RV32ZVE32F-NEXT: .LBB95_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB95_3 +; RV32ZVE32F-NEXT: .LBB95_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB95_4 +; RV32ZVE32F-NEXT: .LBB95_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB95_5 +; RV32ZVE32F-NEXT: .LBB95_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB95_6 +; RV32ZVE32F-NEXT: 
.LBB95_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB95_7 +; RV32ZVE32F-NEXT: .LBB95_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB95_8 +; RV32ZVE32F-NEXT: j .LBB95_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_zext_v8i32_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: beqz a3, .LBB95_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 32 +; RV64ZVE32F-NEXT: srli a3, a3, 29 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa0, 0(a3) +; RV64ZVE32F-NEXT: .LBB95_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB95_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 32 +; RV64ZVE32F-NEXT: srli a3, a3, 29 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa1, 0(a3) +; RV64ZVE32F-NEXT: .LBB95_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB95_14 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: bnez a3, .LBB95_15 +; RV64ZVE32F-NEXT: .LBB95_6: # 
%else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB95_16 +; RV64ZVE32F-NEXT: .LBB95_7: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB95_9 +; RV64ZVE32F-NEXT: .LBB95_8: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 32 +; RV64ZVE32F-NEXT: srli a3, a3, 29 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB95_9: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB95_11 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 32 +; RV64ZVE32F-NEXT: srli a3, a3, 29 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa6, 0(a3) +; RV64ZVE32F-NEXT: .LBB95_11: # %else17 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB95_13 +; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB95_13: # %else20 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB95_14: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 32 +; RV64ZVE32F-NEXT: srli a3, a3, 29 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 
+; RV64ZVE32F-NEXT: beqz a3, .LBB95_6 +; RV64ZVE32F-NEXT: .LBB95_15: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 32 +; RV64ZVE32F-NEXT: srli a3, a3, 29 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB95_7 +; RV64ZVE32F-NEXT: .LBB95_16: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 32 +; RV64ZVE32F-NEXT: srli a3, a3, 29 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB95_8 +; RV64ZVE32F-NEXT: j .LBB95_9 %eidxs = zext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -2100,24 +12681,243 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(double* %base, <8 x i32> % } define <8 x double> @mgather_baseidx_v8f64(double* %base, <8 x i64> %idxs, <8 x i1> %m, <8 x double> %passthru) { -; RV32-LABEL: mgather_baseidx_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsll.vi v8, v8, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret +; RV32V-LABEL: mgather_baseidx_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsll.vi v8, v8, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v8 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vluxei32.v v12, (a0), v16, v0.t +; RV32V-NEXT: vmv.v.v v8, v12 +; 
RV32V-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vsll.vi v8, v8, 3 -; RV64-NEXT: vluxei64.v v12, (a0), v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v8f64: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsll.vi v8, v8, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t +; RV64V-NEXT: vmv.v.v v8, v12 +; RV64V-NEXT: ret +; +; RV32ZVE32F-LABEL: mgather_baseidx_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -96 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 96 +; RV32ZVE32F-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset ra, -4 +; RV32ZVE32F-NEXT: .cfi_offset s0, -8 +; RV32ZVE32F-NEXT: addi s0, sp, 96 +; RV32ZVE32F-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVE32F-NEXT: andi sp, sp, -32 +; RV32ZVE32F-NEXT: lw a3, 0(a2) +; RV32ZVE32F-NEXT: lw a4, 8(a2) +; RV32ZVE32F-NEXT: lw a5, 16(a2) +; RV32ZVE32F-NEXT: lw a6, 24(a2) +; RV32ZVE32F-NEXT: lw a7, 56(a2) +; RV32ZVE32F-NEXT: lw t0, 48(a2) +; RV32ZVE32F-NEXT: lw t1, 40(a2) +; RV32ZVE32F-NEXT: lw a2, 32(a2) +; RV32ZVE32F-NEXT: sw a7, 60(sp) +; RV32ZVE32F-NEXT: sw t0, 56(sp) +; RV32ZVE32F-NEXT: sw t1, 52(sp) +; RV32ZVE32F-NEXT: sw a2, 48(sp) +; RV32ZVE32F-NEXT: sw a6, 44(sp) +; RV32ZVE32F-NEXT: sw a5, 40(sp) +; RV32ZVE32F-NEXT: sw a4, 36(sp) +; RV32ZVE32F-NEXT: sw a3, 32(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: addi a2, sp, 32 +; RV32ZVE32F-NEXT: vle32.v v8, (a2) +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB96_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: bnez a2, .LBB96_11 +; RV32ZVE32F-NEXT: .LBB96_2: # %else2 +; 
RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: bnez a2, .LBB96_12 +; RV32ZVE32F-NEXT: .LBB96_3: # %else5 +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: bnez a2, .LBB96_13 +; RV32ZVE32F-NEXT: .LBB96_4: # %else8 +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: bnez a2, .LBB96_14 +; RV32ZVE32F-NEXT: .LBB96_5: # %else11 +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: bnez a2, .LBB96_15 +; RV32ZVE32F-NEXT: .LBB96_6: # %else14 +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: bnez a2, .LBB96_16 +; RV32ZVE32F-NEXT: .LBB96_7: # %else17 +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: beqz a1, .LBB96_9 +; RV32ZVE32F-NEXT: .LBB96_8: # %cond.load19 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa7, 0(a1) +; RV32ZVE32F-NEXT: .LBB96_9: # %else20 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: fsd fa1, 8(a0) +; RV32ZVE32F-NEXT: fsd fa2, 16(a0) +; RV32ZVE32F-NEXT: fsd fa3, 24(a0) +; RV32ZVE32F-NEXT: fsd fa4, 32(a0) +; RV32ZVE32F-NEXT: fsd fa5, 40(a0) +; RV32ZVE32F-NEXT: fsd fa6, 48(a0) +; RV32ZVE32F-NEXT: fsd fa7, 56(a0) +; RV32ZVE32F-NEXT: addi sp, s0, -96 +; RV32ZVE32F-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 96 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB96_10: # %cond.load +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 2 +; RV32ZVE32F-NEXT: beqz a2, .LBB96_2 +; RV32ZVE32F-NEXT: .LBB96_11: # %cond.load1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 4 +; RV32ZVE32F-NEXT: beqz a2, .LBB96_3 +; RV32ZVE32F-NEXT: .LBB96_12: # %cond.load4 +; RV32ZVE32F-NEXT: vsetivli 
zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 8 +; RV32ZVE32F-NEXT: beqz a2, .LBB96_4 +; RV32ZVE32F-NEXT: .LBB96_13: # %cond.load7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 16 +; RV32ZVE32F-NEXT: beqz a2, .LBB96_5 +; RV32ZVE32F-NEXT: .LBB96_14: # %cond.load10 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 32 +; RV32ZVE32F-NEXT: beqz a2, .LBB96_6 +; RV32ZVE32F-NEXT: .LBB96_15: # %cond.load13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a2) +; RV32ZVE32F-NEXT: andi a2, a1, 64 +; RV32ZVE32F-NEXT: beqz a2, .LBB96_7 +; RV32ZVE32F-NEXT: .LBB96_16: # %cond.load16 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a2, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a2) +; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: bnez a1, .LBB96_8 +; RV32ZVE32F-NEXT: j .LBB96_9 +; +; RV64ZVE32F-LABEL: mgather_baseidx_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: bnez a4, .LBB96_10 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a3, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB96_11 +; RV64ZVE32F-NEXT: .LBB96_2: # %else2 +; RV64ZVE32F-NEXT: andi a4, a3, 4 +; RV64ZVE32F-NEXT: bnez a4, .LBB96_12 +; RV64ZVE32F-NEXT: .LBB96_3: # %else5 +; RV64ZVE32F-NEXT: andi a4, a3, 8 +; RV64ZVE32F-NEXT: bnez a4, .LBB96_13 +; RV64ZVE32F-NEXT: 
.LBB96_4: # %else8 +; RV64ZVE32F-NEXT: andi a4, a3, 16 +; RV64ZVE32F-NEXT: bnez a4, .LBB96_14 +; RV64ZVE32F-NEXT: .LBB96_5: # %else11 +; RV64ZVE32F-NEXT: andi a4, a3, 32 +; RV64ZVE32F-NEXT: bnez a4, .LBB96_15 +; RV64ZVE32F-NEXT: .LBB96_6: # %else14 +; RV64ZVE32F-NEXT: andi a4, a3, 64 +; RV64ZVE32F-NEXT: bnez a4, .LBB96_16 +; RV64ZVE32F-NEXT: .LBB96_7: # %else17 +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: beqz a3, .LBB96_9 +; RV64ZVE32F-NEXT: .LBB96_8: # %cond.load19 +; RV64ZVE32F-NEXT: ld a2, 56(a2) +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a1, a1, a2 +; RV64ZVE32F-NEXT: fld fa7, 0(a1) +; RV64ZVE32F-NEXT: .LBB96_9: # %else20 +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa1, 8(a0) +; RV64ZVE32F-NEXT: fsd fa2, 16(a0) +; RV64ZVE32F-NEXT: fsd fa3, 24(a0) +; RV64ZVE32F-NEXT: fsd fa4, 32(a0) +; RV64ZVE32F-NEXT: fsd fa5, 40(a0) +; RV64ZVE32F-NEXT: fsd fa6, 48(a0) +; RV64ZVE32F-NEXT: fsd fa7, 56(a0) +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB96_10: # %cond.load +; RV64ZVE32F-NEXT: ld a4, 0(a2) +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa0, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a3, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB96_2 +; RV64ZVE32F-NEXT: .LBB96_11: # %cond.load1 +; RV64ZVE32F-NEXT: ld a4, 8(a2) +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa1, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a3, 4 +; RV64ZVE32F-NEXT: beqz a4, .LBB96_3 +; RV64ZVE32F-NEXT: .LBB96_12: # %cond.load4 +; RV64ZVE32F-NEXT: ld a4, 16(a2) +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa2, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a3, 8 +; RV64ZVE32F-NEXT: beqz a4, .LBB96_4 +; RV64ZVE32F-NEXT: .LBB96_13: # %cond.load7 +; RV64ZVE32F-NEXT: ld a4, 24(a2) +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa3, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a3, 16 +; RV64ZVE32F-NEXT: beqz a4, .LBB96_5 
+; RV64ZVE32F-NEXT: .LBB96_14: # %cond.load10 +; RV64ZVE32F-NEXT: ld a4, 32(a2) +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa4, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a3, 32 +; RV64ZVE32F-NEXT: beqz a4, .LBB96_6 +; RV64ZVE32F-NEXT: .LBB96_15: # %cond.load13 +; RV64ZVE32F-NEXT: ld a4, 40(a2) +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa5, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a3, 64 +; RV64ZVE32F-NEXT: beqz a4, .LBB96_7 +; RV64ZVE32F-NEXT: .LBB96_16: # %cond.load16 +; RV64ZVE32F-NEXT: ld a4, 48(a2) +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa6, 0(a4) +; RV64ZVE32F-NEXT: andi a3, a3, -128 +; RV64ZVE32F-NEXT: bnez a3, .LBB96_8 +; RV64ZVE32F-NEXT: j .LBB96_9 %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v @@ -2135,14 +12935,234 @@ define <16 x i8> @mgather_baseidx_v16i8(i8* %base, <16 x i8> %idxs, <16 x i1> %m ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v16i8: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v9, (a0), v16, v0.t -; RV64-NEXT: vmv.v.v v8, v9 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v16i8: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v16, v0.t +; RV64V-NEXT: vmv.v.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v16i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_2 +; RV64ZVE32F-NEXT: # 
%bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: .LBB97_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 +; RV64ZVE32F-NEXT: .LBB97_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 2 +; RV64ZVE32F-NEXT: .LBB97_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_28 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_29 +; RV64ZVE32F-NEXT: .LBB97_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_10 +; RV64ZVE32F-NEXT: .LBB97_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: 
vsetivli zero, 6, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 +; RV64ZVE32F-NEXT: .LBB97_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_30 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_31 +; RV64ZVE32F-NEXT: .LBB97_12: # %else20 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_32 +; RV64ZVE32F-NEXT: .LBB97_13: # %else23 +; RV64ZVE32F-NEXT: andi a2, a1, 512 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_15 +; RV64ZVE32F-NEXT: .LBB97_14: # %cond.load25 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 9 +; RV64ZVE32F-NEXT: .LBB97_15: # %else26 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 1024 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_17 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load28 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 10 +; RV64ZVE32F-NEXT: .LBB97_17: # %else29 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: lui a2, 1 +; RV64ZVE32F-NEXT: addiw a3, a2, -2048 +; RV64ZVE32F-NEXT: and a3, a1, a3 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB97_19 +; RV64ZVE32F-NEXT: # %bb.18: # %cond.load31 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, 
mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lb a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.s.x v10, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 11 +; RV64ZVE32F-NEXT: .LBB97_19: # %else32 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_21 +; RV64ZVE32F-NEXT: # %bb.20: # %cond.load34 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 12 +; RV64ZVE32F-NEXT: .LBB97_21: # %else35 +; RV64ZVE32F-NEXT: lui a2, 2 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_23 +; RV64ZVE32F-NEXT: # %bb.22: # %cond.load37 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 13 +; RV64ZVE32F-NEXT: .LBB97_23: # %else38 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 4 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_25 +; RV64ZVE32F-NEXT: # %bb.24: # %cond.load40 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 14 +; RV64ZVE32F-NEXT: .LBB97_25: # %else41 +; RV64ZVE32F-NEXT: lui a2, 1048568 +; RV64ZVE32F-NEXT: and a1, a1, a2 +; RV64ZVE32F-NEXT: beqz a1, .LBB97_27 +; 
RV64ZVE32F-NEXT: # %bb.26: # %cond.load43 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 15 +; RV64ZVE32F-NEXT: .LBB97_27: # %else44 +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB97_28: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_8 +; RV64ZVE32F-NEXT: .LBB97_29: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_9 +; RV64ZVE32F-NEXT: j .LBB97_10 +; RV64ZVE32F-NEXT: .LBB97_30: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 6 +; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_12 +; RV64ZVE32F-NEXT: .LBB97_31: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, 
a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 7 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_13 +; RV64ZVE32F-NEXT: .LBB97_32: # %cond.load22 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m1, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 8 +; RV64ZVE32F-NEXT: andi a2, a1, 512 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_14 +; RV64ZVE32F-NEXT: j .LBB97_15 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %idxs %v = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 2, <16 x i1> %m, <16 x i8> %passthru) ret <16 x i8> %v @@ -2161,27 +13181,518 @@ define <32 x i8> @mgather_baseidx_v32i8(i8* %base, <32 x i8> %idxs, <32 x i1> %m ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: mgather_baseidx_v32i8: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vmv1r.v v12, v10 -; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t -; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, mu -; RV64-NEXT: vslidedown.vi v10, v10, 16 -; RV64-NEXT: vslidedown.vi v8, v8, 16 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetvli zero, a0, e8, m2, tu, mu -; RV64-NEXT: vslideup.vi v12, v10, 16 -; RV64-NEXT: vmv2r.v v8, v12 -; RV64-NEXT: ret +; RV64V-LABEL: mgather_baseidx_v32i8: +; RV64V: # %bb.0: +; RV64V-NEXT: vsetivli zero, 16, e64, m8, 
ta, mu +; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64V-NEXT: vmv1r.v v12, v10 +; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64V-NEXT: vslidedown.vi v10, v10, 16 +; RV64V-NEXT: vslidedown.vi v8, v8, 16 +; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64V-NEXT: vslidedown.vi v0, v0, 2 +; RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64V-NEXT: vluxei64.v v10, (a0), v16, v0.t +; RV64V-NEXT: li a0, 32 +; RV64V-NEXT: vsetvli zero, a0, e8, m2, tu, mu +; RV64V-NEXT: vslideup.vi v12, v10, 16 +; RV64V-NEXT: vmv2r.v v8, v12 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v32i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: .LBB98_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 +; RV64ZVE32F-NEXT: .LBB98_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; 
RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 +; RV64ZVE32F-NEXT: .LBB98_6: # %else5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_60 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_61 +; RV64ZVE32F-NEXT: .LBB98_8: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_10 +; RV64ZVE32F-NEXT: .LBB98_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 5 +; RV64ZVE32F-NEXT: .LBB98_10: # %else14 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_62 +; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_63 +; RV64ZVE32F-NEXT: .LBB98_12: # %else20 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_64 +; RV64ZVE32F-NEXT: .LBB98_13: # %else23 +; RV64ZVE32F-NEXT: andi a2, a1, 512 +; RV64ZVE32F-NEXT: beqz 
a2, .LBB98_15 +; RV64ZVE32F-NEXT: .LBB98_14: # %cond.load25 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 9 +; RV64ZVE32F-NEXT: .LBB98_15: # %else26 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 1024 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_17 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load28 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 10 +; RV64ZVE32F-NEXT: .LBB98_17: # %else29 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: lui a2, 1 +; RV64ZVE32F-NEXT: addiw a3, a2, -2048 +; RV64ZVE32F-NEXT: and a3, a1, a3 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB98_19 +; RV64ZVE32F-NEXT: # %bb.18: # %cond.load31 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v13 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lb a3, 0(a3) +; RV64ZVE32F-NEXT: li a4, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a4, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 11 +; RV64ZVE32F-NEXT: .LBB98_19: # %else32 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 16 +; 
RV64ZVE32F-NEXT: beqz a2, .LBB98_21 +; RV64ZVE32F-NEXT: # %bb.20: # %cond.load34 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 12 +; RV64ZVE32F-NEXT: .LBB98_21: # %else35 +; RV64ZVE32F-NEXT: lui a2, 2 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_23 +; RV64ZVE32F-NEXT: # %bb.22: # %cond.load37 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 13 +; RV64ZVE32F-NEXT: .LBB98_23: # %else38 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 4 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_25 +; RV64ZVE32F-NEXT: # %bb.24: # %cond.load40 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 14 +; RV64ZVE32F-NEXT: .LBB98_25: # %else41 +; RV64ZVE32F-NEXT: lui a2, 8 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_27 +; RV64ZVE32F-NEXT: # %bb.26: # %cond.load43 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; 
RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 15 +; RV64ZVE32F-NEXT: .LBB98_27: # %else44 +; RV64ZVE32F-NEXT: lui a2, 16 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_29 +; RV64ZVE32F-NEXT: # %bb.28: # %cond.load46 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 17, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 16 +; RV64ZVE32F-NEXT: .LBB98_29: # %else47 +; RV64ZVE32F-NEXT: lui a2, 32 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_31 +; RV64ZVE32F-NEXT: # %bb.30: # %cond.load49 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 18, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 17 +; RV64ZVE32F-NEXT: .LBB98_31: # %else50 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 64 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_33 +; RV64ZVE32F-NEXT: # %bb.32: # %cond.load52 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 19, e8, m2, tu, mu +; 
RV64ZVE32F-NEXT: vslideup.vi v10, v14, 18 +; RV64ZVE32F-NEXT: .LBB98_33: # %else53 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: lui a2, 128 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_35 +; RV64ZVE32F-NEXT: # %bb.34: # %cond.load55 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 20, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 19 +; RV64ZVE32F-NEXT: .LBB98_35: # %else56 +; RV64ZVE32F-NEXT: lui a2, 256 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_37 +; RV64ZVE32F-NEXT: # %bb.36: # %cond.load58 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 21, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 20 +; RV64ZVE32F-NEXT: .LBB98_37: # %else59 +; RV64ZVE32F-NEXT: lui a2, 512 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_39 +; RV64ZVE32F-NEXT: # %bb.38: # %cond.load61 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 22, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 21 +; RV64ZVE32F-NEXT: .LBB98_39: # %else62 +; RV64ZVE32F-NEXT: 
vsetivli zero, 8, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 1024 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_41 +; RV64ZVE32F-NEXT: # %bb.40: # %cond.load64 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 23, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 22 +; RV64ZVE32F-NEXT: .LBB98_41: # %else65 +; RV64ZVE32F-NEXT: lui a2, 2048 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_43 +; RV64ZVE32F-NEXT: # %bb.42: # %cond.load67 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 24, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 23 +; RV64ZVE32F-NEXT: .LBB98_43: # %else68 +; RV64ZVE32F-NEXT: lui a2, 4096 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_45 +; RV64ZVE32F-NEXT: # %bb.44: # %cond.load70 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 25, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 24 +; RV64ZVE32F-NEXT: .LBB98_45: # %else71 +; RV64ZVE32F-NEXT: lui a2, 8192 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_47 +; RV64ZVE32F-NEXT: # 
%bb.46: # %cond.load73 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 26, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 25 +; RV64ZVE32F-NEXT: .LBB98_47: # %else74 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 16384 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_49 +; RV64ZVE32F-NEXT: # %bb.48: # %cond.load76 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 27, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 26 +; RV64ZVE32F-NEXT: .LBB98_49: # %else77 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: lui a2, 32768 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_51 +; RV64ZVE32F-NEXT: # %bb.50: # %cond.load79 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 28, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 27 +; RV64ZVE32F-NEXT: .LBB98_51: # %else80 +; RV64ZVE32F-NEXT: lui a2, 65536 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_53 +; RV64ZVE32F-NEXT: # %bb.52: # %cond.load82 +; RV64ZVE32F-NEXT: vsetivli zero, 0, 
e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 29, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 28 +; RV64ZVE32F-NEXT: .LBB98_53: # %else83 +; RV64ZVE32F-NEXT: lui a2, 131072 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_55 +; RV64ZVE32F-NEXT: # %bb.54: # %cond.load85 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 30, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 29 +; RV64ZVE32F-NEXT: .LBB98_55: # %else86 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 262144 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_57 +; RV64ZVE32F-NEXT: # %bb.56: # %cond.load88 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 31, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 30 +; RV64ZVE32F-NEXT: .LBB98_57: # %else89 +; RV64ZVE32F-NEXT: lui a2, 524288 +; RV64ZVE32F-NEXT: and a1, a1, a2 +; RV64ZVE32F-NEXT: beqz a1, .LBB98_59 +; RV64ZVE32F-NEXT: # %bb.58: # %cond.load91 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: lb a0, 0(a0) +; RV64ZVE32F-NEXT: li a1, 32 +; 
RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 31 +; RV64ZVE32F-NEXT: .LBB98_59: # %else92 +; RV64ZVE32F-NEXT: vmv2r.v v8, v10 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB98_60: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_8 +; RV64ZVE32F-NEXT: .LBB98_61: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 4 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_9 +; RV64ZVE32F-NEXT: j .LBB98_10 +; RV64ZVE32F-NEXT: .LBB98_62: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 6 +; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_12 +; RV64ZVE32F-NEXT: .LBB98_63: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add 
a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 7 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_13 +; RV64ZVE32F-NEXT: .LBB98_64: # %cond.load22 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lb a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 8 +; RV64ZVE32F-NEXT: andi a2, a1, 512 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_14 +; RV64ZVE32F-NEXT: j .LBB98_15 %ptrs = getelementptr inbounds i8, i8* %base, <32 x i8> %idxs %v = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %ptrs, i32 2, <32 x i1> %m, <32 x i8> %passthru) ret <32 x i8> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index bc1f4b02d219..9ae089d3cefc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -1,23 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+experimental-zvfh,+v -target-abi=ilp32d \ -; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+experimental-zvfh,+v -target-abi=lp64d \ ; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 
-mattr=+m,+d,+zfh,+experimental-zvfh,+zve32f -target-abi=ilp32d \ +; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+experimental-zvfh,+zve32f -target-abi=lp64d \ +; RUN: -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64ZVE32F declare void @llvm.masked.scatter.v1i8.v1p0i8(<1 x i8>, <1 x i8*>, i32, <1 x i1>) define void @mscatter_v1i8(<1 x i8> %val, <1 x i8*> %ptrs, <1 x i1> %m) { -; RV32-LABEL: mscatter_v1i8: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v1i8: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v1i8: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v1i8: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v1i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB0_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: .LBB0_2: # %else +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v1i8.v1p0i8(<1 x i8> %val, <1 x i8*> %ptrs, i32 1, <1 x i1> %m) ret void } @@ -25,28 +49,70 @@ define void @mscatter_v1i8(<1 x i8> %val, <1 x i8*> %ptrs, <1 x i1> %m) { declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x 
i1>) define void @mscatter_v2i8(<2 x i8> %val, <2 x i8*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2i8: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2i8: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2i8: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2i8: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB1_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB1_4 +; RV64ZVE32F-NEXT: .LBB1_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB1_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB1_2 +; RV64ZVE32F-NEXT: .LBB1_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu 
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %val, <2 x i8*> %ptrs, i32 1, <2 x i1> %m) ret void } define void @mscatter_v2i16_truncstore_v2i8(<2 x i16> %val, <2 x i8*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2i16_truncstore_v2i8: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu -; RV32-NEXT: vncvt.x.x.w v8, v8 -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2i16_truncstore_v2i8: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; RV32V-NEXT: vncvt.x.x.w v8, v8 +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2i16_truncstore_v2i8: ; RV64: # %bb.0: @@ -54,20 +120,65 @@ define void @mscatter_v2i16_truncstore_v2i8(<2 x i16> %val, <2 x i8*> %ptrs, <2 ; RV64-NEXT: vncvt.x.x.w v8, v8 ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2i16_truncstore_v2i8: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vncvt.x.x.w v8, v8 +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2i16_truncstore_v2i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, 
ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8 +; RV64ZVE32F-NEXT: bnez a3, .LBB2_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB2_4 +; RV64ZVE32F-NEXT: .LBB2_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB2_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB2_2 +; RV64ZVE32F-NEXT: .LBB2_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i16> %val to <2 x i8> call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m) ret void } define void @mscatter_v2i32_truncstore_v2i8(<2 x i32> %val, <2 x i8*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2i32_truncstore_v2i8: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV32-NEXT: vncvt.x.x.w v8, v8 -; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; RV32-NEXT: vncvt.x.x.w v8, v8 -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2i32_truncstore_v2i8: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32V-NEXT: vncvt.x.x.w v8, v8 +; RV32V-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; RV32V-NEXT: vncvt.x.x.w v8, v8 +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2i32_truncstore_v2i8: ; RV64: # %bb.0: @@ -77,22 +188,71 @@ define void @mscatter_v2i32_truncstore_v2i8(<2 x i32> %val, <2 x i8*> %ptrs, <2 ; RV64-NEXT: vncvt.x.x.w v8, v8 ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2i32_truncstore_v2i8: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu 
+; RV32ZVE32F-NEXT: vncvt.x.x.w v8, v8 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vncvt.x.x.w v8, v8 +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2i32_truncstore_v2i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8 +; RV64ZVE32F-NEXT: bnez a3, .LBB3_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB3_4 +; RV64ZVE32F-NEXT: .LBB3_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB3_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB3_2 +; RV64ZVE32F-NEXT: .LBB3_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i32> %val to <2 x i8> call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m) ret void } define void @mscatter_v2i64_truncstore_v2i8(<2 x 
i64> %val, <2 x i8*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2i64_truncstore_v2i8: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vncvt.x.x.w v8, v8 -; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; RV32-NEXT: vncvt.x.x.w v8, v8 -; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; RV32-NEXT: vncvt.x.x.w v8, v8 -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2i64_truncstore_v2i8: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v8, v8 +; RV32V-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32V-NEXT: vncvt.x.x.w v8, v8 +; RV32V-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; RV32V-NEXT: vncvt.x.x.w v8, v8 +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2i64_truncstore_v2i8: ; RV64: # %bb.0: @@ -104,6 +264,70 @@ define void @mscatter_v2i64_truncstore_v2i8(<2 x i64> %val, <2 x i8*> %ptrs, <2 ; RV64-NEXT: vncvt.x.x.w v8, v8 ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i8: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: lw a1, 8(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: sb a1, 15(sp) +; RV32ZVE32F-NEXT: sb a0, 14(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: addi a0, sp, 15 +; RV32ZVE32F-NEXT: vle8.v v9, (a0) +; RV32ZVE32F-NEXT: addi a0, sp, 14 +; RV32ZVE32F-NEXT: vle8.v v10, (a0) +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 1 +; RV32ZVE32F-NEXT: vsoxei32.v v10, (zero), v8, v0.t +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; 
RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a4, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a4) +; RV64ZVE32F-NEXT: lbu a4, 15(sp) +; RV64ZVE32F-NEXT: sb a1, 14(sp) +; RV64ZVE32F-NEXT: sb a0, 13(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: addi a0, sp, 14 +; RV64ZVE32F-NEXT: vle8.v v9, (a0) +; RV64ZVE32F-NEXT: addi a0, sp, 13 +; RV64ZVE32F-NEXT: vle8.v v8, (a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, mu +; RV64ZVE32F-NEXT: andi a0, a4, 1 +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: bnez a0, .LBB4_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB4_4 +; RV64ZVE32F-NEXT: .LBB4_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB4_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a2) +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB4_2 +; RV64ZVE32F-NEXT: .LBB4_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v8, (a3) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i64> %val to <2 x i8> call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %tval, <2 x i8*> %ptrs, i32 1, <2 x i1> %m) ret void @@ -123,6 +347,64 @@ define void @mscatter_v4i8(<4 x i8> %val, <4 x i8*> %ptrs, <4 x i1> %m) { ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v4i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; 
RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a1) +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: bnez a5, .LBB5_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB5_6 +; RV64ZVE32F-NEXT: .LBB5_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB5_7 +; RV64ZVE32F-NEXT: .LBB5_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB5_8 +; RV64ZVE32F-NEXT: .LBB5_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB5_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB5_2 +; RV64ZVE32F-NEXT: .LBB5_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB5_3 +; RV64ZVE32F-NEXT: .LBB5_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v9, (a3) +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB5_4 +; RV64ZVE32F-NEXT: .LBB5_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> %m) ret void } @@ -139,6 +421,64 @@ define void @mscatter_truemask_v4i8(<4 x i8> %val, <4 x i8*> %ptrs) { ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10 ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_truemask_v4i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB6_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB6_6 +; RV64ZVE32F-NEXT: .LBB6_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB6_7 +; RV64ZVE32F-NEXT: .LBB6_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB6_8 +; RV64ZVE32F-NEXT: .LBB6_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB6_5: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a4) +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB6_2 +; RV64ZVE32F-NEXT: .LBB6_6: # %cond.store1 +; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v9, (a3) +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB6_3 +; RV64ZVE32F-NEXT: .LBB6_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB6_4 +; RV64ZVE32F-NEXT: .LBB6_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> %mtrue) @@ -167,6 +507,90 @@ define void @mscatter_v8i8(<8 x i8> %val, <8 x i8*> %ptrs, <8 x i1> %m) { ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v8i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a1, 56(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi t1, a3, 1 +; RV64ZVE32F-NEXT: bnez t1, .LBB8_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB8_10 +; RV64ZVE32F-NEXT: .LBB8_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB8_11 +; RV64ZVE32F-NEXT: .LBB8_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB8_12 +; RV64ZVE32F-NEXT: .LBB8_4: # %else6 +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; 
RV64ZVE32F-NEXT: bnez a0, .LBB8_13 +; RV64ZVE32F-NEXT: .LBB8_5: # %else8 +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB8_14 +; RV64ZVE32F-NEXT: .LBB8_6: # %else10 +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: bnez a0, .LBB8_15 +; RV64ZVE32F-NEXT: .LBB8_7: # %else12 +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB8_16 +; RV64ZVE32F-NEXT: .LBB8_8: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB8_9: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB8_2 +; RV64ZVE32F-NEXT: .LBB8_10: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v9, (t0) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB8_3 +; RV64ZVE32F-NEXT: .LBB8_11: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v9, (a7) +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB8_4 +; RV64ZVE32F-NEXT: .LBB8_12: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v9, (a6) +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB8_5 +; RV64ZVE32F-NEXT: .LBB8_13: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse8.v v9, (a5) +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB8_6 +; RV64ZVE32F-NEXT: .LBB8_14: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse8.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: beqz a0, .LBB8_7 +; RV64ZVE32F-NEXT: .LBB8_15: # %cond.store11 +; RV64ZVE32F-NEXT: 
vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 6 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB8_8 +; RV64ZVE32F-NEXT: .LBB8_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse8.v v8, (a1) +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %val, <8 x i8*> %ptrs, i32 1, <8 x i1> %m) ret void } @@ -187,6 +611,106 @@ define void @mscatter_baseidx_v8i8(<8 x i8> %val, i8* %base, <8 x i8> %idxs, <8 ; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB9_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB9_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB9_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB9_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB9_14 +; RV64ZVE32F-NEXT: .LBB9_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_10 +; RV64ZVE32F-NEXT: .LBB9_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB9_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB9_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB9_16 +; RV64ZVE32F-NEXT: .LBB9_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB9_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_8 +; RV64ZVE32F-NEXT: .LBB9_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; 
RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB9_9 +; RV64ZVE32F-NEXT: j .LBB9_10 +; RV64ZVE32F-NEXT: .LBB9_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB9_12 +; RV64ZVE32F-NEXT: .LBB9_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %val, <8 x i8*> %ptrs, i32 1, <8 x i1> %m) ret void @@ -195,17 +719,37 @@ define void @mscatter_baseidx_v8i8(<8 x i8> %val, i8* %base, <8 x i8> %idxs, <8 declare void @llvm.masked.scatter.v1i16.v1p0i16(<1 x i16>, <1 x i16*>, i32, <1 x i1>) define void @mscatter_v1i16(<1 x i16> %val, <1 x i16*> %ptrs, <1 x i1> %m) { -; RV32-LABEL: mscatter_v1i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v1i16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v1i16: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v1i16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v1i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: 
vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB10_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: .LBB10_2: # %else +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v1i16.v1p0i16(<1 x i16> %val, <1 x i16*> %ptrs, i32 2, <1 x i1> %m) ret void } @@ -213,28 +757,70 @@ define void @mscatter_v1i16(<1 x i16> %val, <1 x i16*> %ptrs, <1 x i1> %m) { declare void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16>, <2 x i16*>, i32, <2 x i1>) define void @mscatter_v2i16(<2 x i16> %val, <2 x i16*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2i16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2i16: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2i16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; 
RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB11_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB11_4 +; RV64ZVE32F-NEXT: .LBB11_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB11_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB11_2 +; RV64ZVE32F-NEXT: .LBB11_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %val, <2 x i16*> %ptrs, i32 2, <2 x i1> %m) ret void } define void @mscatter_v2i32_truncstore_v2i16(<2 x i32> %val, <2 x i16*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2i32_truncstore_v2i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV32-NEXT: vncvt.x.x.w v8, v8 -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2i32_truncstore_v2i16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32V-NEXT: vncvt.x.x.w v8, v8 +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2i32_truncstore_v2i16: ; RV64: # %bb.0: @@ -242,20 +828,65 @@ define void @mscatter_v2i32_truncstore_v2i16(<2 x i32> %val, <2 x i16*> %ptrs, < ; RV64-NEXT: vncvt.x.x.w v8, v8 ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2i32_truncstore_v2i16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vncvt.x.x.w v8, v8 +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t 
+; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2i32_truncstore_v2i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: vncvt.x.x.w v8, v8 +; RV64ZVE32F-NEXT: bnez a3, .LBB12_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB12_4 +; RV64ZVE32F-NEXT: .LBB12_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB12_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB12_2 +; RV64ZVE32F-NEXT: .LBB12_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i32> %val to <2 x i16> call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %tval, <2 x i16*> %ptrs, i32 2, <2 x i1> %m) ret void } define void @mscatter_v2i64_truncstore_v2i16(<2 x i64> %val, <2 x i16*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2i64_truncstore_v2i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vncvt.x.x.w v8, v8 -; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; RV32-NEXT: 
vncvt.x.x.w v8, v8 -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2i64_truncstore_v2i16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v8, v8 +; RV32V-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32V-NEXT: vncvt.x.x.w v8, v8 +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2i64_truncstore_v2i16: ; RV64: # %bb.0: @@ -265,6 +896,70 @@ define void @mscatter_v2i64_truncstore_v2i16(<2 x i64> %val, <2 x i16*> %ptrs, < ; RV64-NEXT: vncvt.x.x.w v8, v8 ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: lw a1, 8(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: sh a1, 14(sp) +; RV32ZVE32F-NEXT: sh a0, 12(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: addi a0, sp, 14 +; RV32ZVE32F-NEXT: vle16.v v9, (a0) +; RV32ZVE32F-NEXT: addi a0, sp, 12 +; RV32ZVE32F-NEXT: vle16.v v10, (a0) +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 1 +; RV32ZVE32F-NEXT: vsoxei32.v v10, (zero), v8, v0.t +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a4, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a4) 
+; RV64ZVE32F-NEXT: lbu a4, 15(sp) +; RV64ZVE32F-NEXT: sh a1, 12(sp) +; RV64ZVE32F-NEXT: sh a0, 10(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: addi a0, sp, 12 +; RV64ZVE32F-NEXT: vle16.v v9, (a0) +; RV64ZVE32F-NEXT: addi a0, sp, 10 +; RV64ZVE32F-NEXT: vle16.v v8, (a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, mu +; RV64ZVE32F-NEXT: andi a0, a4, 1 +; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: bnez a0, .LBB13_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB13_4 +; RV64ZVE32F-NEXT: .LBB13_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB13_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a2) +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB13_2 +; RV64ZVE32F-NEXT: .LBB13_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v8, (a3) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i64> %val to <2 x i16> call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %tval, <2 x i16*> %ptrs, i32 2, <2 x i1> %m) ret void @@ -284,6 +979,64 @@ define void @mscatter_v4i16(<4 x i16> %val, <4 x i16*> %ptrs, <4 x i1> %m) { ; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v4i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, 
mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a1) +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: bnez a5, .LBB14_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB14_6 +; RV64ZVE32F-NEXT: .LBB14_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB14_7 +; RV64ZVE32F-NEXT: .LBB14_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB14_8 +; RV64ZVE32F-NEXT: .LBB14_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB14_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB14_2 +; RV64ZVE32F-NEXT: .LBB14_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB14_3 +; RV64ZVE32F-NEXT: .LBB14_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v9, (a3) +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB14_4 +; RV64ZVE32F-NEXT: .LBB14_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %val, <4 x i16*> %ptrs, i32 2, <4 x i1> %m) ret void } @@ -300,6 +1053,64 @@ define void @mscatter_truemask_v4i16(<4 x i16> %val, <4 x i16*> %ptrs) { ; RV64-NEXT: vsetivli zero, 4, e16, 
mf2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10 ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_truemask_v4i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB15_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB15_6 +; RV64ZVE32F-NEXT: .LBB15_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB15_7 +; RV64ZVE32F-NEXT: .LBB15_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB15_8 +; RV64ZVE32F-NEXT: .LBB15_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB15_5: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a4) +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB15_2 +; RV64ZVE32F-NEXT: .LBB15_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v9, (a3) +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB15_3 +; RV64ZVE32F-NEXT: .LBB15_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; 
RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB15_4 +; RV64ZVE32F-NEXT: .LBB15_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %val, <4 x i16*> %ptrs, i32 2, <4 x i1> %mtrue) @@ -328,6 +1139,90 @@ define void @mscatter_v8i16(<8 x i16> %val, <8 x i16*> %ptrs, <8 x i1> %m) { ; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v8i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a1, 56(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi t1, a3, 1 +; RV64ZVE32F-NEXT: bnez t1, .LBB17_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB17_10 +; RV64ZVE32F-NEXT: .LBB17_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB17_11 +; RV64ZVE32F-NEXT: .LBB17_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB17_12 +; RV64ZVE32F-NEXT: .LBB17_4: # %else6 +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB17_13 +; RV64ZVE32F-NEXT: .LBB17_5: # %else8 +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB17_14 +; RV64ZVE32F-NEXT: .LBB17_6: # %else10 +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: bnez a0, .LBB17_15 +; RV64ZVE32F-NEXT: .LBB17_7: # %else12 +; RV64ZVE32F-NEXT: 
andi a0, a3, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB17_16 +; RV64ZVE32F-NEXT: .LBB17_8: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB17_9: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB17_2 +; RV64ZVE32F-NEXT: .LBB17_10: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v9, (t0) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB17_3 +; RV64ZVE32F-NEXT: .LBB17_11: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v9, (a7) +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB17_4 +; RV64ZVE32F-NEXT: .LBB17_12: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a6) +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB17_5 +; RV64ZVE32F-NEXT: .LBB17_13: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a5) +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB17_6 +; RV64ZVE32F-NEXT: .LBB17_14: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: beqz a0, .LBB17_7 +; RV64ZVE32F-NEXT: .LBB17_15: # %cond.store11 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 6 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB17_8 +; RV64ZVE32F-NEXT: .LBB17_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m) ret void } @@ -350,6 +1245,114 @@ define void @mscatter_baseidx_v8i8_v8i16(<8 x i16> %val, i16* %base, <8 x i8> %i ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8_v8i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB18_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB18_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB18_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; 
RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB18_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB18_14 +; RV64ZVE32F-NEXT: .LBB18_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_10 +; RV64ZVE32F-NEXT: .LBB18_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB18_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB18_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB18_16 +; RV64ZVE32F-NEXT: .LBB18_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB18_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_8 +; RV64ZVE32F-NEXT: .LBB18_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, 
.LBB18_9 +; RV64ZVE32F-NEXT: j .LBB18_10 +; RV64ZVE32F-NEXT: .LBB18_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB18_12 +; RV64ZVE32F-NEXT: .LBB18_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, i16* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m) ret void @@ -373,6 +1376,114 @@ define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, i16* %base, <8 x i ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB19_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli 
a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB19_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB19_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_14 +; RV64ZVE32F-NEXT: .LBB19_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_10 +; RV64ZVE32F-NEXT: .LBB19_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB19_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB19_16 +; RV64ZVE32F-NEXT: .LBB19_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB19_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, 
mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_8 +; RV64ZVE32F-NEXT: .LBB19_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_9 +; RV64ZVE32F-NEXT: j .LBB19_10 +; RV64ZVE32F-NEXT: .LBB19_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB19_12 +; RV64ZVE32F-NEXT: .LBB19_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %eidxs call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m) @@ -397,6 +1508,122 @@ define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, i16* %base, <8 x i ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; 
RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB20_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB20_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB20_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_14 +; RV64ZVE32F-NEXT: .LBB20_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 
32 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_10 +; RV64ZVE32F-NEXT: .LBB20_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB20_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB20_16 +; RV64ZVE32F-NEXT: .LBB20_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB20_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_8 +; RV64ZVE32F-NEXT: .LBB20_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_9 +; RV64ZVE32F-NEXT: j .LBB20_10 +; RV64ZVE32F-NEXT: .LBB20_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 
1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB20_12 +; RV64ZVE32F-NEXT: .LBB20_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 255 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %eidxs call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m) @@ -421,6 +1648,115 @@ define void @mscatter_baseidx_v8i16(<8 x i16> %val, i16* %base, <8 x i16> %idxs, ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB21_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add 
a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB21_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB21_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB21_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB21_14 +; RV64ZVE32F-NEXT: .LBB21_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_10 +; RV64ZVE32F-NEXT: .LBB21_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB21_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB21_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB21_16 +; RV64ZVE32F-NEXT: .LBB21_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB21_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: 
vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_8 +; RV64ZVE32F-NEXT: .LBB21_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB21_9 +; RV64ZVE32F-NEXT: j .LBB21_10 +; RV64ZVE32F-NEXT: .LBB21_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB21_12 +; RV64ZVE32F-NEXT: .LBB21_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %idxs call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %val, <8 x i16*> %ptrs, i32 2, <8 x i1> %m) ret void @@ -429,17 +1765,37 @@ define void @mscatter_baseidx_v8i16(<8 x i16> %val, i16* %base, <8 x i16> %idxs, declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>) define void @mscatter_v1i32(<1 x i32> %val, <1 x i32*> %ptrs, <1 x 
i1> %m) { -; RV32-LABEL: mscatter_v1i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v1i32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v1i32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v1i32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v1i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB22_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: .LBB22_2: # %else +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %val, <1 x i32*> %ptrs, i32 4, <1 x i1> %m) ret void } @@ -447,28 +1803,70 @@ define void @mscatter_v1i32(<1 x i32> %val, <1 x i32*> %ptrs, <1 x i1> %m) { declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32>, <2 x i32*>, i32, <2 x i1>) define void @mscatter_v2i32(<2 x i32> %val, <2 x i32*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2i32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2i32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; 
RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2i32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB23_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB23_4 +; RV64ZVE32F-NEXT: .LBB23_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB23_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB23_2 +; RV64ZVE32F-NEXT: .LBB23_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %val, <2 x i32*> %ptrs, i32 4, <2 x i1> %m) ret void } define void @mscatter_v2i64_truncstore_v2i32(<2 x i64> %val, <2 x i32*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2i64_truncstore_v2i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vncvt.x.x.w v8, v8 
-; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2i64_truncstore_v2i32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v8, v8 +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2i64_truncstore_v2i32: ; RV64: # %bb.0: @@ -476,6 +1874,62 @@ define void @mscatter_v2i64_truncstore_v2i32(<2 x i64> %val, <2 x i32*> %ptrs, < ; RV64-NEXT: vncvt.x.x.w v8, v8 ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: lw a1, 0(a0) +; RV32ZVE32F-NEXT: addi a0, a0, 8 +; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vlse32.v v9, (a0), zero +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV32ZVE32F-NEXT: vmv.s.x v9, a1 +; RV32ZVE32F-NEXT: vsoxei32.v v9, (zero), v8, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -32 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 32 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a4, sp, 31 +; RV64ZVE32F-NEXT: vsm.v v8, (a4) +; RV64ZVE32F-NEXT: lbu a4, 31(sp) +; RV64ZVE32F-NEXT: sw a1, 24(sp) +; RV64ZVE32F-NEXT: sw a0, 20(sp) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: addi a0, sp, 24 +; RV64ZVE32F-NEXT: vle32.v v9, (a0) +; RV64ZVE32F-NEXT: addi a0, sp, 20 +; RV64ZVE32F-NEXT: vle32.v v8, (a0) +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu +; RV64ZVE32F-NEXT: andi a0, a4, 1 +; 
RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: bnez a0, .LBB24_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB24_4 +; RV64ZVE32F-NEXT: .LBB24_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 32 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB24_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB24_2 +; RV64ZVE32F-NEXT: .LBB24_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v8, (a3) +; RV64ZVE32F-NEXT: addi sp, sp, 32 +; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i64> %val to <2 x i32> call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %tval, <2 x i32*> %ptrs, i32 4, <2 x i1> %m) ret void @@ -495,6 +1949,64 @@ define void @mscatter_v4i32(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %m) { ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v4i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a1) +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: bnez a5, .LBB25_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, 
a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB25_6 +; RV64ZVE32F-NEXT: .LBB25_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB25_7 +; RV64ZVE32F-NEXT: .LBB25_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB25_8 +; RV64ZVE32F-NEXT: .LBB25_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB25_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB25_2 +; RV64ZVE32F-NEXT: .LBB25_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB25_3 +; RV64ZVE32F-NEXT: .LBB25_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v9, (a3) +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB25_4 +; RV64ZVE32F-NEXT: .LBB25_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 4, <4 x i1> %m) ret void } @@ -511,6 +2023,64 @@ define void @mscatter_truemask_v4i32(<4 x i32> %val, <4 x i32*> %ptrs) { ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10 ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_truemask_v4i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, 
ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB26_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB26_6 +; RV64ZVE32F-NEXT: .LBB26_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB26_7 +; RV64ZVE32F-NEXT: .LBB26_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB26_8 +; RV64ZVE32F-NEXT: .LBB26_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB26_5: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a4) +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB26_2 +; RV64ZVE32F-NEXT: .LBB26_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v9, (a3) +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB26_3 +; RV64ZVE32F-NEXT: .LBB26_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB26_4 +; RV64ZVE32F-NEXT: .LBB26_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = 
shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 4, <4 x i1> %mtrue) @@ -539,6 +2109,90 @@ define void @mscatter_v8i32(<8 x i32> %val, <8 x i32*> %ptrs, <8 x i1> %m) { ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a1, 56(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi t1, a3, 1 +; RV64ZVE32F-NEXT: bnez t1, .LBB28_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB28_10 +; RV64ZVE32F-NEXT: .LBB28_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB28_11 +; RV64ZVE32F-NEXT: .LBB28_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB28_12 +; RV64ZVE32F-NEXT: .LBB28_4: # %else6 +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB28_13 +; RV64ZVE32F-NEXT: .LBB28_5: # %else8 +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB28_14 +; RV64ZVE32F-NEXT: .LBB28_6: # %else10 +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: bnez a0, .LBB28_15 +; RV64ZVE32F-NEXT: .LBB28_7: # %else12 +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB28_16 +; RV64ZVE32F-NEXT: .LBB28_8: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB28_9: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB28_2 +; RV64ZVE32F-NEXT: .LBB28_10: # %cond.store1 +; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v10, (t0) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB28_3 +; RV64ZVE32F-NEXT: .LBB28_11: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v10, (a7) +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB28_4 +; RV64ZVE32F-NEXT: .LBB28_12: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v10, (a6) +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB28_5 +; RV64ZVE32F-NEXT: .LBB28_13: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v10, (a5) +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB28_6 +; RV64ZVE32F-NEXT: .LBB28_14: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v10, (a4) +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: beqz a0, .LBB28_7 +; RV64ZVE32F-NEXT: .LBB28_15: # %cond.store11 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB28_8 +; RV64ZVE32F-NEXT: .LBB28_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) ret void } @@ -560,6 +2214,114 @@ define void @mscatter_baseidx_v8i8_v8i32(<8 x i32> %val, i32* %base, <8 x i8> %i ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), 
v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB29_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB29_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB29_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB29_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB29_14 +; RV64ZVE32F-NEXT: .LBB29_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_10 +; RV64ZVE32F-NEXT: .LBB29_9: # %cond.store9 +; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB29_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB29_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB29_16 +; RV64ZVE32F-NEXT: .LBB29_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB29_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_8 +; RV64ZVE32F-NEXT: .LBB29_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB29_9 +; RV64ZVE32F-NEXT: j .LBB29_10 +; RV64ZVE32F-NEXT: .LBB29_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: 
beqz a1, .LBB29_12 +; RV64ZVE32F-NEXT: .LBB29_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, i32* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) ret void @@ -582,6 +2344,114 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, i32* %base, <8 x i ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB30_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB30_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz 
a2, .LBB30_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB30_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB30_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB30_14 +; RV64ZVE32F-NEXT: .LBB30_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_10 +; RV64ZVE32F-NEXT: .LBB30_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB30_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB30_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB30_16 +; RV64ZVE32F-NEXT: .LBB30_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB30_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_8 +; 
RV64ZVE32F-NEXT: .LBB30_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB30_9 +; RV64ZVE32F-NEXT: j .LBB30_10 +; RV64ZVE32F-NEXT: .LBB30_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB30_12 +; RV64ZVE32F-NEXT: .LBB30_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) @@ -605,6 +2475,122 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, i32* %base, <8 x i ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, 
a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB31_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB31_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB31_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_14 +; RV64ZVE32F-NEXT: .LBB31_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_10 +; RV64ZVE32F-NEXT: .LBB31_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli 
zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB31_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB31_16 +; RV64ZVE32F-NEXT: .LBB31_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB31_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_8 +; RV64ZVE32F-NEXT: .LBB31_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_9 +; RV64ZVE32F-NEXT: j .LBB31_10 +; RV64ZVE32F-NEXT: .LBB31_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB31_12 +; RV64ZVE32F-NEXT: .LBB31_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: andi a1, a1, 255 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) @@ -628,6 +2614,115 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, i32* %base, <8 x i16> ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i16_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB32_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB32_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz 
a2, .LBB32_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB32_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_14 +; RV64ZVE32F-NEXT: .LBB32_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_10 +; RV64ZVE32F-NEXT: .LBB32_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB32_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB32_16 +; RV64ZVE32F-NEXT: .LBB32_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB32_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_8 +; 
RV64ZVE32F-NEXT: .LBB32_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_9 +; RV64ZVE32F-NEXT: j .LBB32_10 +; RV64ZVE32F-NEXT: .LBB32_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB32_12 +; RV64ZVE32F-NEXT: .LBB32_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, i32* %base, <8 x i16> %idxs call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) ret void @@ -650,6 +2745,115 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, i32* %base, <8 x ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; 
RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB33_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB33_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB33_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB33_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB33_14 +; RV64ZVE32F-NEXT: .LBB33_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_10 +; RV64ZVE32F-NEXT: .LBB33_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; 
RV64ZVE32F-NEXT: .LBB33_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB33_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB33_16 +; RV64ZVE32F-NEXT: .LBB33_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB33_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_8 +; RV64ZVE32F-NEXT: .LBB33_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB33_9 +; RV64ZVE32F-NEXT: j .LBB33_10 +; RV64ZVE32F-NEXT: .LBB33_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB33_12 +; RV64ZVE32F-NEXT: .LBB33_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) @@ -673,6 +2877,125 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, i32* %base, <8 x ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: lui a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: addiw a1, a1, -1 +; RV64ZVE32F-NEXT: beqz a3, .LBB34_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a3) +; RV64ZVE32F-NEXT: .LBB34_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB34_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: .LBB34_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB34_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; 
RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: .LBB34_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB34_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB34_14 +; RV64ZVE32F-NEXT: .LBB34_8: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB34_10 +; RV64ZVE32F-NEXT: .LBB34_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: .LBB34_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB34_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_16 +; RV64ZVE32F-NEXT: .LBB34_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB34_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, 
.LBB34_8 +; RV64ZVE32F-NEXT: .LBB34_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB34_9 +; RV64ZVE32F-NEXT: j .LBB34_10 +; RV64ZVE32F-NEXT: .LBB34_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_12 +; RV64ZVE32F-NEXT: .LBB34_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: and a1, a2, a1 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %eidxs call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) @@ -695,6 +3018,117 @@ define void @mscatter_baseidx_v8i32(<8 x i32> %val, i32* %base, <8 x i32> %idxs, ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_2 
+; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB35_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB35_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_13 +; RV64ZVE32F-NEXT: .LBB35_6: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_14 +; RV64ZVE32F-NEXT: .LBB35_7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_9 +; RV64ZVE32F-NEXT: .LBB35_8: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB35_9: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi 
v10, v12, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_15 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB35_16 +; RV64ZVE32F-NEXT: .LBB35_11: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB35_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_6 +; RV64ZVE32F-NEXT: .LBB35_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_7 +; RV64ZVE32F-NEXT: .LBB35_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_8 +; RV64ZVE32F-NEXT: j .LBB35_9 +; RV64ZVE32F-NEXT: .LBB35_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB35_11 +; RV64ZVE32F-NEXT: .LBB35_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: 
vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %idxs call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %val, <8 x i32*> %ptrs, i32 4, <8 x i1> %m) ret void @@ -703,17 +3137,46 @@ define void @mscatter_baseidx_v8i32(<8 x i32> %val, i32* %base, <8 x i32> %idxs, declare void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64>, <1 x i64*>, i32, <1 x i1>) define void @mscatter_v1i64(<1 x i64> %val, <1 x i64*> %ptrs, <1 x i1> %m) { -; RV32-LABEL: mscatter_v1i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v1i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v1i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v1i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetvli a2, zero, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vmv.x.s a2, v9 +; RV32ZVE32F-NEXT: andi a2, a2, 1 +; RV32ZVE32F-NEXT: beqz a2, .LBB36_2 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.store +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a2, v8 +; RV32ZVE32F-NEXT: sw a1, 4(a2) +; RV32ZVE32F-NEXT: sw a0, 0(a2) +; RV32ZVE32F-NEXT: .LBB36_2: # %else +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v1i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a2, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: 
vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: sd a0, 0(a1) +; RV64ZVE32F-NEXT: .LBB36_2: # %else +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> %val, <1 x i64*> %ptrs, i32 8, <1 x i1> %m) ret void } @@ -721,17 +3184,94 @@ define void @mscatter_v1i64(<1 x i64> %val, <1 x i64*> %ptrs, <1 x i1> %m) { declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64>, <2 x i64*>, i32, <2 x i1>) define void @mscatter_v2i64(<2 x i64> %val, <2 x i64*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a1, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a1) +; RV32ZVE32F-NEXT: lbu a3, 15(sp) +; RV32ZVE32F-NEXT: lw a2, 12(a0) +; RV32ZVE32F-NEXT: lw a1, 8(a0) +; RV32ZVE32F-NEXT: andi a4, a3, 1 +; RV32ZVE32F-NEXT: bnez a4, .LBB37_3 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a3, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB37_4 +; 
RV32ZVE32F-NEXT: .LBB37_2: # %else2 +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB37_3: # %cond.store +; RV32ZVE32F-NEXT: lw a4, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a5, v8 +; RV32ZVE32F-NEXT: sw a4, 4(a5) +; RV32ZVE32F-NEXT: sw a0, 0(a5) +; RV32ZVE32F-NEXT: andi a0, a3, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB37_2 +; RV32ZVE32F-NEXT: .LBB37_4: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: sw a1, 0(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a4, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a4) +; RV64ZVE32F-NEXT: lbu a4, 15(sp) +; RV64ZVE32F-NEXT: andi a5, a4, 1 +; RV64ZVE32F-NEXT: bnez a5, .LBB37_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB37_4 +; RV64ZVE32F-NEXT: .LBB37_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB37_3: # %cond.store +; RV64ZVE32F-NEXT: sd a0, 0(a2) +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB37_2 +; RV64ZVE32F-NEXT: .LBB37_4: # %cond.store1 +; RV64ZVE32F-NEXT: sd a1, 0(a3) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void 
@llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %val, <2 x i64*> %ptrs, i32 8, <2 x i1> %m) ret void } @@ -739,33 +3279,283 @@ define void @mscatter_v2i64(<2 x i64> %val, <2 x i64*> %ptrs, <2 x i1> %m) { declare void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64>, <4 x i64*>, i32, <4 x i1>) define void @mscatter_v4i64(<4 x i64> %val, <4 x i64*> %ptrs, <4 x i1> %m) { -; RV32-LABEL: mscatter_v4i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v4i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v4i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a1, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a1) +; RV32ZVE32F-NEXT: lw a1, 28(a0) +; RV32ZVE32F-NEXT: lw a2, 24(a0) +; RV32ZVE32F-NEXT: lw a3, 20(a0) +; RV32ZVE32F-NEXT: lw a4, 16(a0) +; RV32ZVE32F-NEXT: lbu a5, 15(sp) +; RV32ZVE32F-NEXT: lw a7, 12(a0) +; RV32ZVE32F-NEXT: lw a6, 8(a0) +; RV32ZVE32F-NEXT: andi t0, a5, 1 +; RV32ZVE32F-NEXT: bnez t0, .LBB38_5 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a5, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB38_6 +; RV32ZVE32F-NEXT: .LBB38_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a5, 4 +; RV32ZVE32F-NEXT: bnez 
a0, .LBB38_7 +; RV32ZVE32F-NEXT: .LBB38_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a5, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB38_8 +; RV32ZVE32F-NEXT: .LBB38_4: # %else6 +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB38_5: # %cond.store +; RV32ZVE32F-NEXT: lw t0, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t1, v8 +; RV32ZVE32F-NEXT: sw t0, 4(t1) +; RV32ZVE32F-NEXT: sw a0, 0(t1) +; RV32ZVE32F-NEXT: andi a0, a5, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB38_2 +; RV32ZVE32F-NEXT: .LBB38_6: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 +; RV32ZVE32F-NEXT: sw a7, 4(a0) +; RV32ZVE32F-NEXT: sw a6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a5, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB38_3 +; RV32ZVE32F-NEXT: .LBB38_7: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a5, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB38_4 +; RV32ZVE32F-NEXT: .LBB38_8: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v4i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, 
e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a2) +; RV64ZVE32F-NEXT: ld a2, 24(a1) +; RV64ZVE32F-NEXT: ld a4, 16(a1) +; RV64ZVE32F-NEXT: ld a7, 8(a1) +; RV64ZVE32F-NEXT: ld a3, 24(a0) +; RV64ZVE32F-NEXT: lbu a5, 15(sp) +; RV64ZVE32F-NEXT: ld a6, 16(a0) +; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: andi t1, a5, 1 +; RV64ZVE32F-NEXT: bnez t1, .LBB38_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB38_6 +; RV64ZVE32F-NEXT: .LBB38_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB38_7 +; RV64ZVE32F-NEXT: .LBB38_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a5, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB38_8 +; RV64ZVE32F-NEXT: .LBB38_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB38_5: # %cond.store +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: sd a0, 0(a1) +; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB38_2 +; RV64ZVE32F-NEXT: .LBB38_6: # %cond.store1 +; RV64ZVE32F-NEXT: sd t0, 0(a7) +; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB38_3 +; RV64ZVE32F-NEXT: .LBB38_7: # %cond.store3 +; RV64ZVE32F-NEXT: sd a6, 0(a4) +; RV64ZVE32F-NEXT: andi a0, a5, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB38_4 +; RV64ZVE32F-NEXT: .LBB38_8: # %cond.store5 +; RV64ZVE32F-NEXT: sd a3, 0(a2) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> %val, <4 x i64*> %ptrs, i32 8, <4 x i1> %m) ret void } define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x i64*> %ptrs) { -; RV32-LABEL: mscatter_truemask_v4i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v10 -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_truemask_v4i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32V-NEXT: 
vsoxei32.v v8, (zero), v10 +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_truemask_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10 ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_truemask_v4i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: lw a1, 28(a0) +; RV32ZVE32F-NEXT: lw a2, 24(a0) +; RV32ZVE32F-NEXT: lw a3, 20(a0) +; RV32ZVE32F-NEXT: lw a4, 16(a0) +; RV32ZVE32F-NEXT: lw a6, 12(a0) +; RV32ZVE32F-NEXT: lw a5, 8(a0) +; RV32ZVE32F-NEXT: lw t0, 4(a0) +; RV32ZVE32F-NEXT: lw a7, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmset.m v0 +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a0, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a0) +; RV32ZVE32F-NEXT: lb a0, 15(sp) +; RV32ZVE32F-NEXT: beqz zero, .LBB39_5 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a7, a0, 2 +; RV32ZVE32F-NEXT: bnez a7, .LBB39_6 +; RV32ZVE32F-NEXT: .LBB39_2: # %else2 +; RV32ZVE32F-NEXT: andi a5, a0, 4 +; RV32ZVE32F-NEXT: bnez a5, .LBB39_7 +; RV32ZVE32F-NEXT: .LBB39_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a0, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB39_8 +; RV32ZVE32F-NEXT: .LBB39_4: # %else6 +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB39_5: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s t1, v8 +; RV32ZVE32F-NEXT: sw t0, 4(t1) +; RV32ZVE32F-NEXT: sw a7, 0(t1) +; RV32ZVE32F-NEXT: andi a7, a0, 2 +; RV32ZVE32F-NEXT: beqz a7, .LBB39_2 +; RV32ZVE32F-NEXT: .LBB39_6: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli 
zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a7, v9 +; RV32ZVE32F-NEXT: sw a6, 4(a7) +; RV32ZVE32F-NEXT: sw a5, 0(a7) +; RV32ZVE32F-NEXT: andi a5, a0, 4 +; RV32ZVE32F-NEXT: beqz a5, .LBB39_3 +; RV32ZVE32F-NEXT: .LBB39_7: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a5, v9 +; RV32ZVE32F-NEXT: sw a4, 0(a5) +; RV32ZVE32F-NEXT: sw a3, 4(a5) +; RV32ZVE32F-NEXT: andi a0, a0, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB39_4 +; RV32ZVE32F-NEXT: .LBB39_8: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 4(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_truemask_v4i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a2, 24(a1) +; RV64ZVE32F-NEXT: ld a3, 16(a1) +; RV64ZVE32F-NEXT: ld a5, 8(a1) +; RV64ZVE32F-NEXT: ld a7, 0(a1) +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a4, 16(a0) +; RV64ZVE32F-NEXT: ld a6, 8(a0) +; RV64ZVE32F-NEXT: ld t0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB39_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a7, a0, 2 +; RV64ZVE32F-NEXT: bnez a7, .LBB39_6 +; 
RV64ZVE32F-NEXT: .LBB39_2: # %else2 +; RV64ZVE32F-NEXT: andi a5, a0, 4 +; RV64ZVE32F-NEXT: bnez a5, .LBB39_7 +; RV64ZVE32F-NEXT: .LBB39_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB39_8 +; RV64ZVE32F-NEXT: .LBB39_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB39_5: # %cond.store +; RV64ZVE32F-NEXT: sd t0, 0(a7) +; RV64ZVE32F-NEXT: andi a7, a0, 2 +; RV64ZVE32F-NEXT: beqz a7, .LBB39_2 +; RV64ZVE32F-NEXT: .LBB39_6: # %cond.store1 +; RV64ZVE32F-NEXT: sd a6, 0(a5) +; RV64ZVE32F-NEXT: andi a5, a0, 4 +; RV64ZVE32F-NEXT: beqz a5, .LBB39_3 +; RV64ZVE32F-NEXT: .LBB39_7: # %cond.store3 +; RV64ZVE32F-NEXT: sd a4, 0(a3) +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB39_4 +; RV64ZVE32F-NEXT: .LBB39_8: # %cond.store5 +; RV64ZVE32F-NEXT: sd a1, 0(a2) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> %val, <4 x i64*> %ptrs, i32 8, <4 x i1> %mtrue) @@ -783,30 +3573,239 @@ define void @mscatter_falsemask_v4i64(<4 x i64> %val, <4 x i64*> %ptrs) { declare void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64>, <8 x i64*>, i32, <8 x i1>) define void @mscatter_v8i64(<8 x i64> %val, <8 x i64*> %ptrs, <8 x i1> %m) { -; RV32-LABEL: mscatter_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 
+; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: lw a1, 60(a0) +; RV32ZVE32F-NEXT: lw a2, 56(a0) +; RV32ZVE32F-NEXT: lw a3, 52(a0) +; RV32ZVE32F-NEXT: lw a4, 48(a0) +; RV32ZVE32F-NEXT: lw a5, 44(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; RV32ZVE32F-NEXT: lw t0, 36(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 28(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 20(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a6, v0 +; RV32ZVE32F-NEXT: andi s1, a6, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB41_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a6, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB41_11 +; RV32ZVE32F-NEXT: .LBB41_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a6, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB41_12 +; RV32ZVE32F-NEXT: .LBB41_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a6, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB41_13 +; RV32ZVE32F-NEXT: .LBB41_4: # %else6 +; RV32ZVE32F-NEXT: andi a0, a6, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB41_14 +; RV32ZVE32F-NEXT: .LBB41_5: # %else8 +; RV32ZVE32F-NEXT: andi a0, a6, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB41_15 +; RV32ZVE32F-NEXT: .LBB41_6: # %else10 +; RV32ZVE32F-NEXT: andi a0, a6, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB41_16 +; RV32ZVE32F-NEXT: .LBB41_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a6, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB41_9 +; RV32ZVE32F-NEXT: .LBB41_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a2, 0(a0) +; RV32ZVE32F-NEXT: sw a1, 
4(a0) +; RV32ZVE32F-NEXT: .LBB41_9: # %else14 +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB41_10: # %cond.store +; RV32ZVE32F-NEXT: lw s1, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw s1, 4(s2) +; RV32ZVE32F-NEXT: sw a0, 0(s2) +; RV32ZVE32F-NEXT: andi a0, a6, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB41_2 +; RV32ZVE32F-NEXT: .LBB41_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a6, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB41_3 +; RV32ZVE32F-NEXT: .LBB41_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a6, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB41_4 +; RV32ZVE32F-NEXT: .LBB41_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a6, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB41_5 +; RV32ZVE32F-NEXT: .LBB41_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a6, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB41_6 +; RV32ZVE32F-NEXT: .LBB41_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; 
RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw a5, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a6, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB41_7 +; RV32ZVE32F-NEXT: .LBB41_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a6, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB41_8 +; RV32ZVE32F-NEXT: j .LBB41_9 +; +; RV64ZVE32F-LABEL: mscatter_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -32 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 32 +; RV64ZVE32F-NEXT: sd s0, 24(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s1, 16(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s2, 8(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: .cfi_offset s0, -8 +; RV64ZVE32F-NEXT: .cfi_offset s1, -16 +; RV64ZVE32F-NEXT: .cfi_offset s2, -24 +; RV64ZVE32F-NEXT: ld a2, 56(a1) +; RV64ZVE32F-NEXT: ld a4, 48(a1) +; RV64ZVE32F-NEXT: ld a6, 40(a1) +; RV64ZVE32F-NEXT: ld t1, 32(a1) +; RV64ZVE32F-NEXT: ld t3, 24(a1) +; RV64ZVE32F-NEXT: ld t5, 16(a1) +; RV64ZVE32F-NEXT: ld s0, 8(a1) +; RV64ZVE32F-NEXT: ld a3, 56(a0) +; RV64ZVE32F-NEXT: ld a5, 48(a0) +; RV64ZVE32F-NEXT: ld t0, 40(a0) +; RV64ZVE32F-NEXT: ld t2, 32(a0) +; RV64ZVE32F-NEXT: ld t4, 24(a0) +; RV64ZVE32F-NEXT: ld t6, 16(a0) +; RV64ZVE32F-NEXT: ld s1, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a7, v0 +; RV64ZVE32F-NEXT: andi s2, a7, 1 +; RV64ZVE32F-NEXT: bnez s2, .LBB41_10 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a7, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB41_11 +; RV64ZVE32F-NEXT: .LBB41_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a7, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB41_12 +; RV64ZVE32F-NEXT: .LBB41_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a7, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB41_13 +; RV64ZVE32F-NEXT: 
.LBB41_4: # %else6 +; RV64ZVE32F-NEXT: andi a0, a7, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB41_14 +; RV64ZVE32F-NEXT: .LBB41_5: # %else8 +; RV64ZVE32F-NEXT: andi a0, a7, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB41_15 +; RV64ZVE32F-NEXT: .LBB41_6: # %else10 +; RV64ZVE32F-NEXT: andi a0, a7, 64 +; RV64ZVE32F-NEXT: bnez a0, .LBB41_16 +; RV64ZVE32F-NEXT: .LBB41_7: # %else12 +; RV64ZVE32F-NEXT: andi a0, a7, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB41_9 +; RV64ZVE32F-NEXT: .LBB41_8: # %cond.store13 +; RV64ZVE32F-NEXT: sd a3, 0(a2) +; RV64ZVE32F-NEXT: .LBB41_9: # %else14 +; RV64ZVE32F-NEXT: ld s0, 24(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s1, 16(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s2, 8(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: addi sp, sp, 32 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB41_10: # %cond.store +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: sd a0, 0(a1) +; RV64ZVE32F-NEXT: andi a0, a7, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB41_2 +; RV64ZVE32F-NEXT: .LBB41_11: # %cond.store1 +; RV64ZVE32F-NEXT: sd s1, 0(s0) +; RV64ZVE32F-NEXT: andi a0, a7, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB41_3 +; RV64ZVE32F-NEXT: .LBB41_12: # %cond.store3 +; RV64ZVE32F-NEXT: sd t6, 0(t5) +; RV64ZVE32F-NEXT: andi a0, a7, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB41_4 +; RV64ZVE32F-NEXT: .LBB41_13: # %cond.store5 +; RV64ZVE32F-NEXT: sd t4, 0(t3) +; RV64ZVE32F-NEXT: andi a0, a7, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB41_5 +; RV64ZVE32F-NEXT: .LBB41_14: # %cond.store7 +; RV64ZVE32F-NEXT: sd t2, 0(t1) +; RV64ZVE32F-NEXT: andi a0, a7, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB41_6 +; RV64ZVE32F-NEXT: .LBB41_15: # %cond.store9 +; RV64ZVE32F-NEXT: sd t0, 0(a6) +; RV64ZVE32F-NEXT: andi a0, a7, 64 +; RV64ZVE32F-NEXT: beqz a0, .LBB41_7 +; RV64ZVE32F-NEXT: .LBB41_16: # %cond.store11 +; RV64ZVE32F-NEXT: sd a5, 0(a4) +; RV64ZVE32F-NEXT: andi a0, a7, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB41_8 +; RV64ZVE32F-NEXT: j .LBB41_9 call void 
@llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) ret void } define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i8> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_v8i8_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsext.vf4 v14, v12 -; RV32-NEXT: vsll.vi v12, v14, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_v8i8_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsext.vf4 v14, v12 +; RV32V-NEXT: vsll.vi v12, v14, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_v8i8_v8i64: ; RV64: # %bb.0: @@ -815,22 +3814,247 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i8> %i ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_v8i8_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: lw a2, 60(a0) +; RV32ZVE32F-NEXT: lw a3, 56(a0) +; RV32ZVE32F-NEXT: lw a4, 52(a0) +; RV32ZVE32F-NEXT: lw a5, 48(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; RV32ZVE32F-NEXT: lw t0, 36(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 28(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 20(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 
8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB42_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB42_11 +; RV32ZVE32F-NEXT: .LBB42_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB42_12 +; RV32ZVE32F-NEXT: .LBB42_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB42_13 +; RV32ZVE32F-NEXT: .LBB42_4: # %else6 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB42_14 +; RV32ZVE32F-NEXT: .LBB42_5: # %else8 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB42_15 +; RV32ZVE32F-NEXT: .LBB42_6: # %else10 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB42_16 +; RV32ZVE32F-NEXT: .LBB42_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB42_9 +; RV32ZVE32F-NEXT: .LBB42_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: .LBB42_9: # %else14 +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB42_10: # %cond.store +; RV32ZVE32F-NEXT: lw s1, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw s1, 4(s2) +; RV32ZVE32F-NEXT: sw a0, 0(s2) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB42_2 +; RV32ZVE32F-NEXT: .LBB42_11: # %cond.store1 +; 
RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB42_3 +; RV32ZVE32F-NEXT: .LBB42_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB42_4 +; RV32ZVE32F-NEXT: .LBB42_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB42_5 +; RV32ZVE32F-NEXT: .LBB42_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB42_6 +; RV32ZVE32F-NEXT: .LBB42_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB42_7 +; RV32ZVE32F-NEXT: .LBB42_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB42_8 +; RV32ZVE32F-NEXT: j .LBB42_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8_v8i64: +; RV64ZVE32F: # %bb.0: +; 
RV64ZVE32F-NEXT: ld a2, 56(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 +; RV64ZVE32F-NEXT: beqz t2, .LBB42_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: sd a0, 0(t2) +; RV64ZVE32F-NEXT: .LBB42_2: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_14 +; RV64ZVE32F-NEXT: .LBB42_8: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_10 +; RV64ZVE32F-NEXT: .LBB42_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: 
vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_16 +; RV64ZVE32F-NEXT: .LBB42_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB42_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_8 +; RV64ZVE32F-NEXT: .LBB42_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_9 +; RV64ZVE32F-NEXT: j .LBB42_10 +; RV64ZVE32F-NEXT: .LBB42_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_12 +; RV64ZVE32F-NEXT: .LBB42_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i64, i64* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) ret void } define void 
@mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i8> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_sext_v8i8_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf8 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_sext_v8i8_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf8 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_sext_v8i8_v8i64: ; RV64: # %bb.0: @@ -839,6 +4063,231 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: lw a2, 60(a0) +; RV32ZVE32F-NEXT: lw a3, 56(a0) +; RV32ZVE32F-NEXT: lw a4, 52(a0) +; RV32ZVE32F-NEXT: lw a5, 48(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; RV32ZVE32F-NEXT: lw t0, 36(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 28(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 20(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; 
RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB43_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB43_11 +; RV32ZVE32F-NEXT: .LBB43_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB43_12 +; RV32ZVE32F-NEXT: .LBB43_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB43_13 +; RV32ZVE32F-NEXT: .LBB43_4: # %else6 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB43_14 +; RV32ZVE32F-NEXT: .LBB43_5: # %else8 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB43_15 +; RV32ZVE32F-NEXT: .LBB43_6: # %else10 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB43_16 +; RV32ZVE32F-NEXT: .LBB43_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB43_9 +; RV32ZVE32F-NEXT: .LBB43_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: .LBB43_9: # %else14 +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB43_10: # %cond.store +; RV32ZVE32F-NEXT: lw s1, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw s1, 4(s2) +; RV32ZVE32F-NEXT: sw a0, 0(s2) +; RV32ZVE32F-NEXT: andi 
a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB43_2 +; RV32ZVE32F-NEXT: .LBB43_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB43_3 +; RV32ZVE32F-NEXT: .LBB43_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB43_4 +; RV32ZVE32F-NEXT: .LBB43_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB43_5 +; RV32ZVE32F-NEXT: .LBB43_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB43_6 +; RV32ZVE32F-NEXT: .LBB43_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB43_7 +; RV32ZVE32F-NEXT: .LBB43_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB43_8 +; RV32ZVE32F-NEXT: j 
.LBB43_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a2, 56(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 +; RV64ZVE32F-NEXT: beqz t2, .LBB43_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: sd a0, 0(t2) +; RV64ZVE32F-NEXT: .LBB43_2: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: .LBB43_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB43_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB43_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB43_14 +; RV64ZVE32F-NEXT: .LBB43_8: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_10 +; RV64ZVE32F-NEXT: .LBB43_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli 
zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB43_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB43_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB43_16 +; RV64ZVE32F-NEXT: .LBB43_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB43_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_8 +; RV64ZVE32F-NEXT: .LBB43_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB43_9 +; RV64ZVE32F-NEXT: j .LBB43_10 +; RV64ZVE32F-NEXT: .LBB43_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_12 +; RV64ZVE32F-NEXT: .LBB43_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs call void 
@llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) @@ -846,16 +4295,16 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i } define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i8> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf8 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_zext_v8i8_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vzext.vf8 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_zext_v8i8_v8i64: ; RV64: # %bb.0: @@ -864,6 +4313,239 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: lw a2, 60(a0) +; RV32ZVE32F-NEXT: lw a3, 56(a0) +; RV32ZVE32F-NEXT: lw a4, 52(a0) +; RV32ZVE32F-NEXT: lw a5, 48(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; 
RV32ZVE32F-NEXT: lw t0, 36(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 28(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 20(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vzext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB44_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB44_11 +; RV32ZVE32F-NEXT: .LBB44_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB44_12 +; RV32ZVE32F-NEXT: .LBB44_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB44_13 +; RV32ZVE32F-NEXT: .LBB44_4: # %else6 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB44_14 +; RV32ZVE32F-NEXT: .LBB44_5: # %else8 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB44_15 +; RV32ZVE32F-NEXT: .LBB44_6: # %else10 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB44_16 +; RV32ZVE32F-NEXT: .LBB44_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB44_9 +; RV32ZVE32F-NEXT: .LBB44_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: .LBB44_9: # %else14 +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB44_10: # %cond.store +; RV32ZVE32F-NEXT: lw s1, 4(a0) +; 
RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw s1, 4(s2) +; RV32ZVE32F-NEXT: sw a0, 0(s2) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB44_2 +; RV32ZVE32F-NEXT: .LBB44_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB44_3 +; RV32ZVE32F-NEXT: .LBB44_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB44_4 +; RV32ZVE32F-NEXT: .LBB44_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB44_5 +; RV32ZVE32F-NEXT: .LBB44_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB44_6 +; RV32ZVE32F-NEXT: .LBB44_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB44_7 +; RV32ZVE32F-NEXT: .LBB44_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi 
v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB44_8 +; RV32ZVE32F-NEXT: j .LBB44_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a2, 56(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 +; RV64ZVE32F-NEXT: beqz t2, .LBB44_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: andi t2, t2, 255 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: sd a0, 0(t2) +; RV64ZVE32F-NEXT: .LBB44_2: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: .LBB44_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB44_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; 
RV64ZVE32F-NEXT: bnez a0, .LBB44_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB44_14 +; RV64ZVE32F-NEXT: .LBB44_8: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_10 +; RV64ZVE32F-NEXT: .LBB44_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB44_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB44_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB44_16 +; RV64ZVE32F-NEXT: .LBB44_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB44_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_8 +; RV64ZVE32F-NEXT: .LBB44_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB44_9 +; RV64ZVE32F-NEXT: j .LBB44_10 +; RV64ZVE32F-NEXT: .LBB44_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: andi 
a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_12 +; RV64ZVE32F-NEXT: .LBB44_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) @@ -871,14 +4553,14 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, i64* %base, <8 x i } define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x i16> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_v8i16_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsext.vf2 v14, v12 -; RV32-NEXT: vsll.vi v12, v14, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_v8i16_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsext.vf2 v14, v12 +; RV32V-NEXT: vsll.vi v12, v14, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_v8i16_v8i64: ; RV64: # %bb.0: @@ -887,22 +4569,248 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x i16> ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_v8i16_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; 
RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: lw a2, 60(a0) +; RV32ZVE32F-NEXT: lw a3, 56(a0) +; RV32ZVE32F-NEXT: lw a4, 52(a0) +; RV32ZVE32F-NEXT: lw a5, 48(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; RV32ZVE32F-NEXT: lw t0, 36(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 28(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 20(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB45_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB45_11 +; RV32ZVE32F-NEXT: .LBB45_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB45_12 +; RV32ZVE32F-NEXT: .LBB45_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB45_13 +; RV32ZVE32F-NEXT: .LBB45_4: # %else6 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB45_14 +; RV32ZVE32F-NEXT: .LBB45_5: # %else8 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB45_15 +; RV32ZVE32F-NEXT: .LBB45_6: # %else10 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB45_16 +; RV32ZVE32F-NEXT: .LBB45_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB45_9 +; RV32ZVE32F-NEXT: .LBB45_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: .LBB45_9: # 
%else14 +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB45_10: # %cond.store +; RV32ZVE32F-NEXT: lw s1, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw s1, 4(s2) +; RV32ZVE32F-NEXT: sw a0, 0(s2) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB45_2 +; RV32ZVE32F-NEXT: .LBB45_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB45_3 +; RV32ZVE32F-NEXT: .LBB45_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB45_4 +; RV32ZVE32F-NEXT: .LBB45_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB45_5 +; RV32ZVE32F-NEXT: .LBB45_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB45_6 +; RV32ZVE32F-NEXT: .LBB45_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, 
v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB45_7 +; RV32ZVE32F-NEXT: .LBB45_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB45_8 +; RV32ZVE32F-NEXT: j .LBB45_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i16_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a2, 56(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 +; RV64ZVE32F-NEXT: beqz t2, .LBB45_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: sd a0, 0(t2) +; RV64ZVE32F-NEXT: .LBB45_2: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: .LBB45_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 
3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB45_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB45_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB45_14 +; RV64ZVE32F-NEXT: .LBB45_8: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_10 +; RV64ZVE32F-NEXT: .LBB45_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB45_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB45_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB45_16 +; RV64ZVE32F-NEXT: .LBB45_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB45_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_8 +; RV64ZVE32F-NEXT: .LBB45_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB45_9 +; RV64ZVE32F-NEXT: j .LBB45_10 +; RV64ZVE32F-NEXT: .LBB45_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, 
a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_12 +; RV64ZVE32F-NEXT: .LBB45_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i64, i64* %base, <8 x i16> %idxs call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) ret void } define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x i16> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf4 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_sext_v8i16_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf4 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_sext_v8i16_v8i64: ; RV64: # %bb.0: @@ -911,6 +4819,232 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; 
RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: lw a2, 60(a0) +; RV32ZVE32F-NEXT: lw a3, 56(a0) +; RV32ZVE32F-NEXT: lw a4, 52(a0) +; RV32ZVE32F-NEXT: lw a5, 48(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; RV32ZVE32F-NEXT: lw t0, 36(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 28(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 20(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB46_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB46_11 +; RV32ZVE32F-NEXT: .LBB46_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB46_12 +; RV32ZVE32F-NEXT: .LBB46_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB46_13 +; RV32ZVE32F-NEXT: .LBB46_4: # %else6 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB46_14 +; RV32ZVE32F-NEXT: .LBB46_5: # %else8 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB46_15 +; RV32ZVE32F-NEXT: .LBB46_6: # %else10 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB46_16 +; RV32ZVE32F-NEXT: .LBB46_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB46_9 +; RV32ZVE32F-NEXT: .LBB46_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: 
vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: .LBB46_9: # %else14 +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB46_10: # %cond.store +; RV32ZVE32F-NEXT: lw s1, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw s1, 4(s2) +; RV32ZVE32F-NEXT: sw a0, 0(s2) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB46_2 +; RV32ZVE32F-NEXT: .LBB46_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB46_3 +; RV32ZVE32F-NEXT: .LBB46_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB46_4 +; RV32ZVE32F-NEXT: .LBB46_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB46_5 +; RV32ZVE32F-NEXT: .LBB46_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB46_6 +; RV32ZVE32F-NEXT: 
.LBB46_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB46_7 +; RV32ZVE32F-NEXT: .LBB46_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB46_8 +; RV32ZVE32F-NEXT: j .LBB46_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a2, 56(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 +; RV64ZVE32F-NEXT: beqz t2, .LBB46_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: sd a0, 0(t2) +; RV64ZVE32F-NEXT: .LBB46_2: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: .LBB46_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz 
a0, .LBB46_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB46_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB46_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB46_14 +; RV64ZVE32F-NEXT: .LBB46_8: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_10 +; RV64ZVE32F-NEXT: .LBB46_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB46_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB46_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB46_16 +; RV64ZVE32F-NEXT: .LBB46_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB46_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_8 +; RV64ZVE32F-NEXT: .LBB46_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB46_9 +; 
RV64ZVE32F-NEXT: j .LBB46_10 +; RV64ZVE32F-NEXT: .LBB46_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_12 +; RV64ZVE32F-NEXT: .LBB46_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) @@ -918,16 +5052,16 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x } define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x i16> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf4 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_zext_v8i16_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vzext.vf4 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_zext_v8i16_v8i64: ; RV64: # %bb.0: @@ -936,6 +5070,242 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x ; RV64-NEXT: vsll.vi v12, v16, 3 ; 
RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: lw a2, 60(a0) +; RV32ZVE32F-NEXT: lw a3, 56(a0) +; RV32ZVE32F-NEXT: lw a4, 52(a0) +; RV32ZVE32F-NEXT: lw a5, 48(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; RV32ZVE32F-NEXT: lw t0, 36(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 28(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 20(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB47_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB47_11 +; RV32ZVE32F-NEXT: .LBB47_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB47_12 +; RV32ZVE32F-NEXT: .LBB47_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB47_13 +; RV32ZVE32F-NEXT: .LBB47_4: # %else6 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB47_14 +; RV32ZVE32F-NEXT: .LBB47_5: # %else8 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB47_15 +; RV32ZVE32F-NEXT: .LBB47_6: # %else10 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB47_16 +; 
RV32ZVE32F-NEXT: .LBB47_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB47_9 +; RV32ZVE32F-NEXT: .LBB47_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: .LBB47_9: # %else14 +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB47_10: # %cond.store +; RV32ZVE32F-NEXT: lw s1, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw s1, 4(s2) +; RV32ZVE32F-NEXT: sw a0, 0(s2) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB47_2 +; RV32ZVE32F-NEXT: .LBB47_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB47_3 +; RV32ZVE32F-NEXT: .LBB47_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB47_4 +; RV32ZVE32F-NEXT: .LBB47_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB47_5 +; RV32ZVE32F-NEXT: .LBB47_14: # %cond.store7 +; RV32ZVE32F-NEXT: 
vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB47_6 +; RV32ZVE32F-NEXT: .LBB47_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB47_7 +; RV32ZVE32F-NEXT: .LBB47_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB47_8 +; RV32ZVE32F-NEXT: j .LBB47_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a2, 56(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a6, 40(a0) +; RV64ZVE32F-NEXT: ld a7, 32(a0) +; RV64ZVE32F-NEXT: ld t0, 24(a0) +; RV64ZVE32F-NEXT: ld t1, 16(a0) +; RV64ZVE32F-NEXT: ld t2, 8(a0) +; RV64ZVE32F-NEXT: lui a4, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi t3, a5, 1 +; RV64ZVE32F-NEXT: addiw a4, a4, -1 +; RV64ZVE32F-NEXT: beqz t3, .LBB47_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t3, v8 +; RV64ZVE32F-NEXT: and t3, t3, a4 +; RV64ZVE32F-NEXT: slli t3, t3, 3 +; RV64ZVE32F-NEXT: add t3, a1, t3 +; RV64ZVE32F-NEXT: sd a0, 0(t3) +; RV64ZVE32F-NEXT: .LBB47_2: # %else +; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: and a0, a0, a4 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t2, 0(a0) +; RV64ZVE32F-NEXT: .LBB47_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: and a0, a0, a4 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: .LBB47_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a5, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a0, a5, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_14 +; RV64ZVE32F-NEXT: .LBB47_8: # %else8 +; RV64ZVE32F-NEXT: andi a0, a5, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_10 +; RV64ZVE32F-NEXT: .LBB47_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: and a0, a0, a4 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB47_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a0, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a0, a5, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_16 +; RV64ZVE32F-NEXT: .LBB47_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB47_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: and 
a0, a0, a4 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a5, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_8 +; RV64ZVE32F-NEXT: .LBB47_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: and a0, a0, a4 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a5, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_9 +; RV64ZVE32F-NEXT: j .LBB47_10 +; RV64ZVE32F-NEXT: .LBB47_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: and a0, a0, a4 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a5, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_12 +; RV64ZVE32F-NEXT: .LBB47_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: and a0, a0, a4 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) @@ -943,13 +5313,13 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, i64* %base, <8 x } define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x i32> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_v8i32_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsll.vi v12, v12, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_v8i32_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; 
RV32V-NEXT: vsll.vi v12, v12, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_v8i32_v8i64: ; RV64: # %bb.0: @@ -958,22 +5328,249 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x i32> ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_v8i32_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: lw a2, 60(a0) +; RV32ZVE32F-NEXT: lw a3, 56(a0) +; RV32ZVE32F-NEXT: lw a4, 52(a0) +; RV32ZVE32F-NEXT: lw a5, 48(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; RV32ZVE32F-NEXT: lw t0, 36(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 28(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 20(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB48_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB48_11 +; RV32ZVE32F-NEXT: .LBB48_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB48_12 +; RV32ZVE32F-NEXT: .LBB48_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB48_13 +; RV32ZVE32F-NEXT: .LBB48_4: # %else6 +; 
RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB48_14 +; RV32ZVE32F-NEXT: .LBB48_5: # %else8 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB48_15 +; RV32ZVE32F-NEXT: .LBB48_6: # %else10 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB48_16 +; RV32ZVE32F-NEXT: .LBB48_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB48_9 +; RV32ZVE32F-NEXT: .LBB48_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: .LBB48_9: # %else14 +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB48_10: # %cond.store +; RV32ZVE32F-NEXT: lw s1, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw s1, 4(s2) +; RV32ZVE32F-NEXT: sw a0, 0(s2) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB48_2 +; RV32ZVE32F-NEXT: .LBB48_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB48_3 +; RV32ZVE32F-NEXT: .LBB48_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB48_4 +; RV32ZVE32F-NEXT: .LBB48_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, 
m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB48_5 +; RV32ZVE32F-NEXT: .LBB48_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB48_6 +; RV32ZVE32F-NEXT: .LBB48_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB48_7 +; RV32ZVE32F-NEXT: .LBB48_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB48_8 +; RV32ZVE32F-NEXT: j .LBB48_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i32_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a2, 56(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 +; RV64ZVE32F-NEXT: beqz t2, .LBB48_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: sd a0, 0(t2) +; RV64ZVE32F-NEXT: 
.LBB48_2: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: .LBB48_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB48_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB48_13 +; RV64ZVE32F-NEXT: .LBB48_6: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB48_14 +; RV64ZVE32F-NEXT: .LBB48_7: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_9 +; RV64ZVE32F-NEXT: .LBB48_8: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB48_9: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB48_15 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB48_16 +; RV64ZVE32F-NEXT: .LBB48_11: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB48_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_6 +; RV64ZVE32F-NEXT: .LBB48_13: # 
%cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_7 +; RV64ZVE32F-NEXT: .LBB48_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB48_8 +; RV64ZVE32F-NEXT: j .LBB48_9 +; RV64ZVE32F-NEXT: .LBB48_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_11 +; RV64ZVE32F-NEXT: .LBB48_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i64, i64* %base, <8 x i32> %idxs call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) ret void } define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x i32> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_sext_v8i32_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf2 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_sext_v8i32_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 
8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf2 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_sext_v8i32_v8i64: ; RV64: # %bb.0: @@ -982,6 +5579,233 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: lw a2, 60(a0) +; RV32ZVE32F-NEXT: lw a3, 56(a0) +; RV32ZVE32F-NEXT: lw a4, 52(a0) +; RV32ZVE32F-NEXT: lw a5, 48(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; RV32ZVE32F-NEXT: lw t0, 36(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 28(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 20(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB49_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB49_11 +; RV32ZVE32F-NEXT: .LBB49_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB49_12 +; 
RV32ZVE32F-NEXT: .LBB49_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB49_13 +; RV32ZVE32F-NEXT: .LBB49_4: # %else6 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB49_14 +; RV32ZVE32F-NEXT: .LBB49_5: # %else8 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB49_15 +; RV32ZVE32F-NEXT: .LBB49_6: # %else10 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB49_16 +; RV32ZVE32F-NEXT: .LBB49_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB49_9 +; RV32ZVE32F-NEXT: .LBB49_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: .LBB49_9: # %else14 +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB49_10: # %cond.store +; RV32ZVE32F-NEXT: lw s1, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw s1, 4(s2) +; RV32ZVE32F-NEXT: sw a0, 0(s2) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB49_2 +; RV32ZVE32F-NEXT: .LBB49_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB49_3 +; RV32ZVE32F-NEXT: .LBB49_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; 
RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB49_4 +; RV32ZVE32F-NEXT: .LBB49_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB49_5 +; RV32ZVE32F-NEXT: .LBB49_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB49_6 +; RV32ZVE32F-NEXT: .LBB49_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB49_7 +; RV32ZVE32F-NEXT: .LBB49_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB49_8 +; RV32ZVE32F-NEXT: j .LBB49_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a2, 56(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld t1, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 +; RV64ZVE32F-NEXT: beqz t2, .LBB49_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, 
mu +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 3 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: sd a0, 0(t2) +; RV64ZVE32F-NEXT: .LBB49_2: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: .LBB49_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB49_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB49_13 +; RV64ZVE32F-NEXT: .LBB49_6: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB49_14 +; RV64ZVE32F-NEXT: .LBB49_7: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_9 +; RV64ZVE32F-NEXT: .LBB49_8: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB49_9: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB49_15 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB49_16 +; RV64ZVE32F-NEXT: .LBB49_11: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB49_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: 
add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_6 +; RV64ZVE32F-NEXT: .LBB49_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_7 +; RV64ZVE32F-NEXT: .LBB49_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB49_8 +; RV64ZVE32F-NEXT: j .LBB49_9 +; RV64ZVE32F-NEXT: .LBB49_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_11 +; RV64ZVE32F-NEXT: .LBB49_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) @@ -989,16 +5813,16 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x } define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x i32> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_zext_v8i32_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf2 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; 
RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_zext_v8i32_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vzext.vf2 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_zext_v8i32_v8i64: ; RV64: # %bb.0: @@ -1007,6 +5831,241 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset s0, -4 +; RV32ZVE32F-NEXT: .cfi_offset s1, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: lw a2, 60(a0) +; RV32ZVE32F-NEXT: lw a3, 56(a0) +; RV32ZVE32F-NEXT: lw a4, 52(a0) +; RV32ZVE32F-NEXT: lw a5, 48(a0) +; RV32ZVE32F-NEXT: lw a6, 44(a0) +; RV32ZVE32F-NEXT: lw a7, 40(a0) +; RV32ZVE32F-NEXT: lw t0, 36(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 28(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 20(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; 
RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB50_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB50_11 +; RV32ZVE32F-NEXT: .LBB50_2: # %else2 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB50_12 +; RV32ZVE32F-NEXT: .LBB50_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB50_13 +; RV32ZVE32F-NEXT: .LBB50_4: # %else6 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB50_14 +; RV32ZVE32F-NEXT: .LBB50_5: # %else8 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB50_15 +; RV32ZVE32F-NEXT: .LBB50_6: # %else10 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB50_16 +; RV32ZVE32F-NEXT: .LBB50_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB50_9 +; RV32ZVE32F-NEXT: .LBB50_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a3, 0(a0) +; RV32ZVE32F-NEXT: sw a2, 4(a0) +; RV32ZVE32F-NEXT: .LBB50_9: # %else14 +; RV32ZVE32F-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB50_10: # %cond.store +; RV32ZVE32F-NEXT: lw s1, 4(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw s1, 4(s2) +; RV32ZVE32F-NEXT: sw a0, 0(s2) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB50_2 +; RV32ZVE32F-NEXT: .LBB50_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 
4 +; RV32ZVE32F-NEXT: beqz a0, .LBB50_3 +; RV32ZVE32F-NEXT: .LBB50_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB50_4 +; RV32ZVE32F-NEXT: .LBB50_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB50_5 +; RV32ZVE32F-NEXT: .LBB50_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB50_6 +; RV32ZVE32F-NEXT: .LBB50_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a7, 0(a0) +; RV32ZVE32F-NEXT: sw a6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB50_7 +; RV32ZVE32F-NEXT: .LBB50_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw a5, 0(a0) +; RV32ZVE32F-NEXT: sw a4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB50_8 +; RV32ZVE32F-NEXT: j .LBB50_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a2, 56(a0) +; RV64ZVE32F-NEXT: ld a3, 48(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a6, 32(a0) +; RV64ZVE32F-NEXT: ld a7, 24(a0) +; RV64ZVE32F-NEXT: ld t0, 16(a0) +; RV64ZVE32F-NEXT: ld t1, 8(a0) 
+; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 +; RV64ZVE32F-NEXT: beqz t2, .LBB50_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 +; RV64ZVE32F-NEXT: slli t2, t2, 32 +; RV64ZVE32F-NEXT: srli t2, t2, 29 +; RV64ZVE32F-NEXT: add t2, a1, t2 +; RV64ZVE32F-NEXT: sd a0, 0(t2) +; RV64ZVE32F-NEXT: .LBB50_2: # %else +; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 32 +; RV64ZVE32F-NEXT: srli a0, a0, 29 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: .LBB50_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_13 +; RV64ZVE32F-NEXT: .LBB50_6: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_14 +; RV64ZVE32F-NEXT: .LBB50_7: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_9 +; RV64ZVE32F-NEXT: .LBB50_8: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 32 +; RV64ZVE32F-NEXT: srli a0, a0, 29 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB50_9: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; 
RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_15 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_16 +; RV64ZVE32F-NEXT: .LBB50_11: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB50_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 32 +; RV64ZVE32F-NEXT: srli a0, a0, 29 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_6 +; RV64ZVE32F-NEXT: .LBB50_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 32 +; RV64ZVE32F-NEXT: srli a0, a0, 29 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_7 +; RV64ZVE32F-NEXT: .LBB50_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 32 +; RV64ZVE32F-NEXT: srli a0, a0, 29 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_8 +; RV64ZVE32F-NEXT: j .LBB50_9 +; RV64ZVE32F-NEXT: .LBB50_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 32 +; RV64ZVE32F-NEXT: srli a0, a0, 29 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_11 +; RV64ZVE32F-NEXT: .LBB50_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 32 +; RV64ZVE32F-NEXT: srli a0, a0, 29 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a2, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i32> 
%idxs to <8 x i64> %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) @@ -1014,15 +6073,15 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, i64* %base, <8 x } define void @mscatter_baseidx_v8i64(<8 x i64> %val, i64* %base, <8 x i64> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsll.vi v12, v12, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_v8i64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsll.vi v12, v12, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_v8i64: ; RV64: # %bb.0: @@ -1030,6 +6089,286 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, i64* %base, <8 x i64> %idxs, ; RV64-NEXT: vsll.vi v12, v12, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -128 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 128 +; RV32ZVE32F-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 116(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 112(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s4, 108(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s5, 104(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s6, 100(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s7, 96(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s8, 92(sp) # 4-byte Folded Spill +; 
RV32ZVE32F-NEXT: sw s9, 88(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s10, 84(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s11, 80(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset ra, -4 +; RV32ZVE32F-NEXT: .cfi_offset s0, -8 +; RV32ZVE32F-NEXT: .cfi_offset s2, -12 +; RV32ZVE32F-NEXT: .cfi_offset s3, -16 +; RV32ZVE32F-NEXT: .cfi_offset s4, -20 +; RV32ZVE32F-NEXT: .cfi_offset s5, -24 +; RV32ZVE32F-NEXT: .cfi_offset s6, -28 +; RV32ZVE32F-NEXT: .cfi_offset s7, -32 +; RV32ZVE32F-NEXT: .cfi_offset s8, -36 +; RV32ZVE32F-NEXT: .cfi_offset s9, -40 +; RV32ZVE32F-NEXT: .cfi_offset s10, -44 +; RV32ZVE32F-NEXT: .cfi_offset s11, -48 +; RV32ZVE32F-NEXT: addi s0, sp, 128 +; RV32ZVE32F-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVE32F-NEXT: andi sp, sp, -32 +; RV32ZVE32F-NEXT: lw a3, 60(a0) +; RV32ZVE32F-NEXT: lw a4, 56(a0) +; RV32ZVE32F-NEXT: lw a5, 52(a0) +; RV32ZVE32F-NEXT: lw a6, 48(a0) +; RV32ZVE32F-NEXT: lw a7, 44(a0) +; RV32ZVE32F-NEXT: lw t0, 40(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw t2, 32(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t4, 24(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t6, 16(a0) +; RV32ZVE32F-NEXT: lw s3, 12(a0) +; RV32ZVE32F-NEXT: lw s2, 8(a0) +; RV32ZVE32F-NEXT: lw s5, 4(a0) +; RV32ZVE32F-NEXT: lw s4, 0(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a2) +; RV32ZVE32F-NEXT: lw s6, 8(a2) +; RV32ZVE32F-NEXT: lw s7, 16(a2) +; RV32ZVE32F-NEXT: lw s8, 24(a2) +; RV32ZVE32F-NEXT: lw s9, 56(a2) +; RV32ZVE32F-NEXT: lw s10, 48(a2) +; RV32ZVE32F-NEXT: lw s11, 40(a2) +; RV32ZVE32F-NEXT: lw a2, 32(a2) +; RV32ZVE32F-NEXT: sw s9, 60(sp) +; RV32ZVE32F-NEXT: sw s10, 56(sp) +; RV32ZVE32F-NEXT: sw s11, 52(sp) +; RV32ZVE32F-NEXT: sw a2, 48(sp) +; RV32ZVE32F-NEXT: sw s8, 44(sp) +; RV32ZVE32F-NEXT: sw s7, 40(sp) +; RV32ZVE32F-NEXT: sw s6, 36(sp) +; RV32ZVE32F-NEXT: sw a0, 32(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: addi a0, sp, 32 +; RV32ZVE32F-NEXT: vle32.v v8, (a0) +; RV32ZVE32F-NEXT: 
vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB51_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB51_11 +; RV32ZVE32F-NEXT: .LBB51_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB51_12 +; RV32ZVE32F-NEXT: .LBB51_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB51_13 +; RV32ZVE32F-NEXT: .LBB51_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB51_14 +; RV32ZVE32F-NEXT: .LBB51_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB51_15 +; RV32ZVE32F-NEXT: .LBB51_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB51_16 +; RV32ZVE32F-NEXT: .LBB51_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB51_9 +; RV32ZVE32F-NEXT: .LBB51_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: sw a4, 0(a0) +; RV32ZVE32F-NEXT: sw a3, 4(a0) +; RV32ZVE32F-NEXT: .LBB51_9: # %else14 +; RV32ZVE32F-NEXT: addi sp, s0, -128 +; RV32ZVE32F-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 116(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 112(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s4, 108(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s5, 104(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s6, 100(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s7, 96(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s8, 92(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s9, 88(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s10, 84(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw 
s11, 80(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 128 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: sw s5, 4(a1) +; RV32ZVE32F-NEXT: sw s4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB51_2 +; RV32ZVE32F-NEXT: .LBB51_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: sw s3, 4(a1) +; RV32ZVE32F-NEXT: sw s2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB51_3 +; RV32ZVE32F-NEXT: .LBB51_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: sw t6, 0(a1) +; RV32ZVE32F-NEXT: sw t5, 4(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB51_4 +; RV32ZVE32F-NEXT: .LBB51_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: sw t4, 0(a1) +; RV32ZVE32F-NEXT: sw t3, 4(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB51_5 +; RV32ZVE32F-NEXT: .LBB51_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: sw t2, 0(a1) +; RV32ZVE32F-NEXT: sw t1, 4(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB51_6 +; RV32ZVE32F-NEXT: .LBB51_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: sw t0, 0(a1) +; RV32ZVE32F-NEXT: sw a7, 4(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB51_7 +; RV32ZVE32F-NEXT: 
.LBB51_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: sw a6, 0(a1) +; RV32ZVE32F-NEXT: sw a5, 4(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB51_8 +; RV32ZVE32F-NEXT: j .LBB51_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -32 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 32 +; RV64ZVE32F-NEXT: sd s0, 24(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s1, 16(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s2, 8(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: sd s3, 0(sp) # 8-byte Folded Spill +; RV64ZVE32F-NEXT: .cfi_offset s0, -8 +; RV64ZVE32F-NEXT: .cfi_offset s1, -16 +; RV64ZVE32F-NEXT: .cfi_offset s2, -24 +; RV64ZVE32F-NEXT: .cfi_offset s3, -32 +; RV64ZVE32F-NEXT: ld a3, 56(a0) +; RV64ZVE32F-NEXT: ld a4, 48(a0) +; RV64ZVE32F-NEXT: ld a6, 40(a0) +; RV64ZVE32F-NEXT: ld t1, 32(a0) +; RV64ZVE32F-NEXT: ld t3, 24(a0) +; RV64ZVE32F-NEXT: ld t6, 16(a0) +; RV64ZVE32F-NEXT: ld s1, 8(a0) +; RV64ZVE32F-NEXT: ld s2, 8(a2) +; RV64ZVE32F-NEXT: ld s0, 16(a2) +; RV64ZVE32F-NEXT: ld t5, 24(a2) +; RV64ZVE32F-NEXT: ld t4, 32(a2) +; RV64ZVE32F-NEXT: ld t2, 40(a2) +; RV64ZVE32F-NEXT: ld t0, 48(a2) +; RV64ZVE32F-NEXT: ld a5, 56(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a7, v0 +; RV64ZVE32F-NEXT: andi s3, a7, 1 +; RV64ZVE32F-NEXT: bnez s3, .LBB51_10 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a7, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB51_11 +; RV64ZVE32F-NEXT: .LBB51_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a7, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB51_12 +; RV64ZVE32F-NEXT: .LBB51_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a7, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB51_13 +; RV64ZVE32F-NEXT: .LBB51_4: # %else6 +; RV64ZVE32F-NEXT: andi a0, a7, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB51_14 +; RV64ZVE32F-NEXT: .LBB51_5: # %else8 +; 
RV64ZVE32F-NEXT: andi a0, a7, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB51_15 +; RV64ZVE32F-NEXT: .LBB51_6: # %else10 +; RV64ZVE32F-NEXT: andi a0, a7, 64 +; RV64ZVE32F-NEXT: bnez a0, .LBB51_16 +; RV64ZVE32F-NEXT: .LBB51_7: # %else12 +; RV64ZVE32F-NEXT: andi a0, a7, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB51_9 +; RV64ZVE32F-NEXT: .LBB51_8: # %cond.store13 +; RV64ZVE32F-NEXT: slli a0, a5, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: .LBB51_9: # %else14 +; RV64ZVE32F-NEXT: ld s0, 24(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s1, 16(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s2, 8(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: ld s3, 0(sp) # 8-byte Folded Reload +; RV64ZVE32F-NEXT: addi sp, sp, 32 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB51_10: # %cond.store +; RV64ZVE32F-NEXT: ld a2, 0(a2) +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a1, a2 +; RV64ZVE32F-NEXT: sd a0, 0(a2) +; RV64ZVE32F-NEXT: andi a0, a7, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB51_2 +; RV64ZVE32F-NEXT: .LBB51_11: # %cond.store1 +; RV64ZVE32F-NEXT: slli a0, s2, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd s1, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a7, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB51_3 +; RV64ZVE32F-NEXT: .LBB51_12: # %cond.store3 +; RV64ZVE32F-NEXT: slli a0, s0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a7, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB51_4 +; RV64ZVE32F-NEXT: .LBB51_13: # %cond.store5 +; RV64ZVE32F-NEXT: slli a0, t5, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t3, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a7, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB51_5 +; RV64ZVE32F-NEXT: .LBB51_14: # %cond.store7 +; RV64ZVE32F-NEXT: slli a0, t4, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a7, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB51_6 +; RV64ZVE32F-NEXT: .LBB51_15: # 
%cond.store9 +; RV64ZVE32F-NEXT: slli a0, t2, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a7, 64 +; RV64ZVE32F-NEXT: beqz a0, .LBB51_7 +; RV64ZVE32F-NEXT: .LBB51_16: # %cond.store11 +; RV64ZVE32F-NEXT: slli a0, t0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a4, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a7, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB51_8 +; RV64ZVE32F-NEXT: j .LBB51_9 %ptrs = getelementptr inbounds i64, i64* %base, <8 x i64> %idxs call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> %val, <8 x i64*> %ptrs, i32 8, <8 x i1> %m) ret void @@ -1038,17 +6377,37 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, i64* %base, <8 x i64> %idxs, declare void @llvm.masked.scatter.v1f16.v1p0f16(<1 x half>, <1 x half*>, i32, <1 x i1>) define void @mscatter_v1f16(<1 x half> %val, <1 x half*> %ptrs, <1 x i1> %m) { -; RV32-LABEL: mscatter_v1f16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v1f16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v1f16: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v1f16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v1f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB52_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: 
vse16.v v8, (a0) +; RV64ZVE32F-NEXT: .LBB52_2: # %else +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v1f16.v1p0f16(<1 x half> %val, <1 x half*> %ptrs, i32 2, <1 x i1> %m) ret void } @@ -1056,17 +6415,59 @@ define void @mscatter_v1f16(<1 x half> %val, <1 x half*> %ptrs, <1 x i1> %m) { declare void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half>, <2 x half*>, i32, <2 x i1>) define void @mscatter_v2f16(<2 x half> %val, <2 x half*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2f16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2f16: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2f16: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2f16: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB53_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB53_4 +; 
RV64ZVE32F-NEXT: .LBB53_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB53_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB53_2 +; RV64ZVE32F-NEXT: .LBB53_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> %val, <2 x half*> %ptrs, i32 2, <2 x i1> %m) ret void } @@ -1085,6 +6486,64 @@ define void @mscatter_v4f16(<4 x half> %val, <4 x half*> %ptrs, <4 x i1> %m) { ; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v4f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a1) +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: bnez a5, .LBB54_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB54_6 +; RV64ZVE32F-NEXT: .LBB54_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB54_7 +; RV64ZVE32F-NEXT: .LBB54_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, 
a2, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB54_8 +; RV64ZVE32F-NEXT: .LBB54_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB54_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB54_2 +; RV64ZVE32F-NEXT: .LBB54_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB54_3 +; RV64ZVE32F-NEXT: .LBB54_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v9, (a3) +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB54_4 +; RV64ZVE32F-NEXT: .LBB54_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %val, <4 x half*> %ptrs, i32 2, <4 x i1> %m) ret void } @@ -1101,6 +6560,64 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x half*> %ptrs) { ; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10 ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_truemask_v4f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; 
RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB55_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB55_6 +; RV64ZVE32F-NEXT: .LBB55_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB55_7 +; RV64ZVE32F-NEXT: .LBB55_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB55_8 +; RV64ZVE32F-NEXT: .LBB55_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB55_5: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a4) +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB55_2 +; RV64ZVE32F-NEXT: .LBB55_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v9, (a3) +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB55_3 +; RV64ZVE32F-NEXT: .LBB55_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB55_4 +; RV64ZVE32F-NEXT: .LBB55_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %val, <4 x half*> %ptrs, i32 2, <4 x i1> %mtrue) @@ -1129,6 +6646,90 @@ define void 
@mscatter_v8f16(<8 x half> %val, <8 x half*> %ptrs, <8 x i1> %m) { ; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v8f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a1, 56(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi t1, a3, 1 +; RV64ZVE32F-NEXT: bnez t1, .LBB57_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB57_10 +; RV64ZVE32F-NEXT: .LBB57_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB57_11 +; RV64ZVE32F-NEXT: .LBB57_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB57_12 +; RV64ZVE32F-NEXT: .LBB57_4: # %else6 +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB57_13 +; RV64ZVE32F-NEXT: .LBB57_5: # %else8 +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB57_14 +; RV64ZVE32F-NEXT: .LBB57_6: # %else10 +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: bnez a0, .LBB57_15 +; RV64ZVE32F-NEXT: .LBB57_7: # %else12 +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB57_16 +; RV64ZVE32F-NEXT: .LBB57_8: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB57_9: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB57_2 +; RV64ZVE32F-NEXT: .LBB57_10: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v9, (t0) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB57_3 +; 
RV64ZVE32F-NEXT: .LBB57_11: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v9, (a7) +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB57_4 +; RV64ZVE32F-NEXT: .LBB57_12: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a6) +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB57_5 +; RV64ZVE32F-NEXT: .LBB57_13: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a5) +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB57_6 +; RV64ZVE32F-NEXT: .LBB57_14: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: beqz a0, .LBB57_7 +; RV64ZVE32F-NEXT: .LBB57_15: # %cond.store11 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 6 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB57_8 +; RV64ZVE32F-NEXT: .LBB57_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v8, (a1) +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m) ret void } @@ -1151,6 +6752,114 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, half* %base, <8 x i8> ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8_v8f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi 
a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB58_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB58_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB58_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB58_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB58_14 +; RV64ZVE32F-NEXT: .LBB58_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_10 +; RV64ZVE32F-NEXT: .LBB58_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli 
zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB58_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB58_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB58_16 +; RV64ZVE32F-NEXT: .LBB58_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB58_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_8 +; RV64ZVE32F-NEXT: .LBB58_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB58_9 +; RV64ZVE32F-NEXT: j .LBB58_10 +; RV64ZVE32F-NEXT: .LBB58_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB58_12 +; RV64ZVE32F-NEXT: .LBB58_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; 
RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds half, half* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m) ret void @@ -1174,6 +6883,114 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, half* %base, <8 x ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB59_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB59_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: 
vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB59_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB59_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB59_14 +; RV64ZVE32F-NEXT: .LBB59_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_10 +; RV64ZVE32F-NEXT: .LBB59_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB59_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB59_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB59_16 +; RV64ZVE32F-NEXT: .LBB59_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB59_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_8 +; RV64ZVE32F-NEXT: .LBB59_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, 
ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB59_9 +; RV64ZVE32F-NEXT: j .LBB59_10 +; RV64ZVE32F-NEXT: .LBB59_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB59_12 +; RV64ZVE32F-NEXT: .LBB59_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %eidxs call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m) @@ -1198,6 +7015,122 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, half* %base, <8 x ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB60_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; 
RV64ZVE32F-NEXT: beqz a2, .LBB60_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB60_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB60_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_14 +; RV64ZVE32F-NEXT: .LBB60_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_10 +; RV64ZVE32F-NEXT: .LBB60_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB60_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; 
RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB60_16 +; RV64ZVE32F-NEXT: .LBB60_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB60_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_8 +; RV64ZVE32F-NEXT: .LBB60_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_9 +; RV64ZVE32F-NEXT: j .LBB60_10 +; RV64ZVE32F-NEXT: .LBB60_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB60_12 +; RV64ZVE32F-NEXT: .LBB60_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 255 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %eidxs call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m) @@ -1222,6 +7155,115 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, half* %base, <8 x i16> %idx ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8f16: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vse16.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB61_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB61_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: 
vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB61_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB61_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB61_14 +; RV64ZVE32F-NEXT: .LBB61_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_10 +; RV64ZVE32F-NEXT: .LBB61_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB61_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB61_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB61_16 +; RV64ZVE32F-NEXT: .LBB61_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB61_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_8 +; RV64ZVE32F-NEXT: .LBB61_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, 
m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB61_9 +; RV64ZVE32F-NEXT: j .LBB61_10 +; RV64ZVE32F-NEXT: .LBB61_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB61_12 +; RV64ZVE32F-NEXT: .LBB61_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: slli a1, a1, 1 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse16.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds half, half* %base, <8 x i16> %idxs call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %val, <8 x half*> %ptrs, i32 2, <8 x i1> %m) ret void @@ -1230,17 +7272,37 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, half* %base, <8 x i16> %idx declare void @llvm.masked.scatter.v1f32.v1p0f32(<1 x float>, <1 x float*>, i32, <1 x i1>) define void @mscatter_v1f32(<1 x float> %val, <1 x float*> %ptrs, <1 x i1> %m) { -; RV32-LABEL: mscatter_v1f32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v1f32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v1f32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v1f32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: 
vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v1f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB62_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: .LBB62_2: # %else +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v1f32.v1p0f32(<1 x float> %val, <1 x float*> %ptrs, i32 4, <1 x i1> %m) ret void } @@ -1248,17 +7310,59 @@ define void @mscatter_v1f32(<1 x float> %val, <1 x float*> %ptrs, <1 x i1> %m) { declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float>, <2 x float*>, i32, <2 x i1>) define void @mscatter_v2f32(<2 x float> %val, <2 x float*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2f32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2f32: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2f32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2f32: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, 
e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB63_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB63_4 +; RV64ZVE32F-NEXT: .LBB63_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB63_3: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB63_2 +; RV64ZVE32F-NEXT: .LBB63_4: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %val, <2 x float*> %ptrs, i32 4, <2 x i1> %m) ret void } @@ -1277,6 +7381,64 @@ define void @mscatter_v4f32(<4 x float> %val, <4 x float*> %ptrs, <4 x i1> %m) { ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v4f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 
+; RV64ZVE32F-NEXT: vsm.v v9, (a1) +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: bnez a5, .LBB64_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB64_6 +; RV64ZVE32F-NEXT: .LBB64_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB64_7 +; RV64ZVE32F-NEXT: .LBB64_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB64_8 +; RV64ZVE32F-NEXT: .LBB64_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB64_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB64_2 +; RV64ZVE32F-NEXT: .LBB64_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v9, (a4) +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB64_3 +; RV64ZVE32F-NEXT: .LBB64_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v9, (a3) +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB64_4 +; RV64ZVE32F-NEXT: .LBB64_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %val, <4 x float*> %ptrs, i32 4, <4 x i1> %m) ret void } @@ -1293,6 +7455,64 @@ define void @mscatter_truemask_v4f32(<4 x float> %val, <4 x float*> %ptrs) { ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10 ; RV64-NEXT: ret +; +; 
RV64ZVE32F-LABEL: mscatter_truemask_v4f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v10, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v9, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB65_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB65_6 +; RV64ZVE32F-NEXT: .LBB65_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB65_7 +; RV64ZVE32F-NEXT: .LBB65_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB65_8 +; RV64ZVE32F-NEXT: .LBB65_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB65_5: # %cond.store +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a4) +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB65_2 +; RV64ZVE32F-NEXT: .LBB65_6: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v9, (a3) +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB65_3 +; RV64ZVE32F-NEXT: .LBB65_7: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v9, (a2) +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: 
beqz a0, .LBB65_4 +; RV64ZVE32F-NEXT: .LBB65_8: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v8, (a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %val, <4 x float*> %ptrs, i32 4, <4 x i1> %mtrue) @@ -1321,6 +7541,90 @@ define void @mscatter_v8f32(<8 x float> %val, <8 x float*> %ptrs, <8 x i1> %m) { ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a1, 56(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi t1, a3, 1 +; RV64ZVE32F-NEXT: bnez t1, .LBB67_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB67_10 +; RV64ZVE32F-NEXT: .LBB67_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB67_11 +; RV64ZVE32F-NEXT: .LBB67_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB67_12 +; RV64ZVE32F-NEXT: .LBB67_4: # %else6 +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB67_13 +; RV64ZVE32F-NEXT: .LBB67_5: # %else8 +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB67_14 +; RV64ZVE32F-NEXT: .LBB67_6: # %else10 +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: bnez a0, .LBB67_15 +; RV64ZVE32F-NEXT: .LBB67_7: # %else12 +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB67_16 +; RV64ZVE32F-NEXT: 
.LBB67_8: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB67_9: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB67_2 +; RV64ZVE32F-NEXT: .LBB67_10: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v10, (t0) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB67_3 +; RV64ZVE32F-NEXT: .LBB67_11: # %cond.store3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v10, (a7) +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB67_4 +; RV64ZVE32F-NEXT: .LBB67_12: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v10, (a6) +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB67_5 +; RV64ZVE32F-NEXT: .LBB67_13: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v10, (a5) +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB67_6 +; RV64ZVE32F-NEXT: .LBB67_14: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v10, (a4) +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: beqz a0, .LBB67_7 +; RV64ZVE32F-NEXT: .LBB67_15: # %cond.store11 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB67_8 +; RV64ZVE32F-NEXT: .LBB67_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: 
vse32.v v8, (a1) +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) ret void } @@ -1342,6 +7646,114 @@ define void @mscatter_baseidx_v8i8_v8f32(<8 x float> %val, float* %base, <8 x i8 ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB68_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB68_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB68_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: 
vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB68_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB68_14 +; RV64ZVE32F-NEXT: .LBB68_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_10 +; RV64ZVE32F-NEXT: .LBB68_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB68_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB68_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB68_16 +; RV64ZVE32F-NEXT: .LBB68_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB68_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_8 +; RV64ZVE32F-NEXT: .LBB68_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB68_9 +; RV64ZVE32F-NEXT: j .LBB68_10 +; 
RV64ZVE32F-NEXT: .LBB68_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB68_12 +; RV64ZVE32F-NEXT: .LBB68_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds float, float* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) ret void @@ -1364,6 +7776,114 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, float* %base, <8 ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB69_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: 
add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB69_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB69_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB69_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB69_14 +; RV64ZVE32F-NEXT: .LBB69_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_10 +; RV64ZVE32F-NEXT: .LBB69_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB69_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB69_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB69_16 +; RV64ZVE32F-NEXT: .LBB69_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB69_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: 
vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_8 +; RV64ZVE32F-NEXT: .LBB69_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB69_9 +; RV64ZVE32F-NEXT: j .LBB69_10 +; RV64ZVE32F-NEXT: .LBB69_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB69_12 +; RV64ZVE32F-NEXT: .LBB69_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) @@ -1387,6 +7907,122 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, float* %base, <8 ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret 
+; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB70_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB70_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB70_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_14 +; RV64ZVE32F-NEXT: .LBB70_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; 
RV64ZVE32F-NEXT: beqz a2, .LBB70_10 +; RV64ZVE32F-NEXT: .LBB70_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB70_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB70_16 +; RV64ZVE32F-NEXT: .LBB70_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB70_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_8 +; RV64ZVE32F-NEXT: .LBB70_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_9 +; RV64ZVE32F-NEXT: j .LBB70_10 +; RV64ZVE32F-NEXT: .LBB70_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 
+; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB70_12 +; RV64ZVE32F-NEXT: .LBB70_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: andi a1, a1, 255 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) @@ -1410,6 +8046,115 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, float* %base, <8 x i ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i16_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB71_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; 
RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB71_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB71_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB71_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB71_14 +; RV64ZVE32F-NEXT: .LBB71_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_10 +; RV64ZVE32F-NEXT: .LBB71_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB71_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB71_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB71_16 +; RV64ZVE32F-NEXT: .LBB71_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB71_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu 
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_8 +; RV64ZVE32F-NEXT: .LBB71_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB71_9 +; RV64ZVE32F-NEXT: j .LBB71_10 +; RV64ZVE32F-NEXT: .LBB71_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB71_12 +; RV64ZVE32F-NEXT: .LBB71_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds float, float* %base, <8 x i16> %idxs call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) ret void @@ -1432,6 +8177,115 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, float* %base, < ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; 
RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB72_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB72_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB72_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB72_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB72_14 +; RV64ZVE32F-NEXT: .LBB72_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_10 +; RV64ZVE32F-NEXT: 
.LBB72_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB72_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB72_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB72_16 +; RV64ZVE32F-NEXT: .LBB72_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB72_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_8 +; RV64ZVE32F-NEXT: .LBB72_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB72_9 +; RV64ZVE32F-NEXT: j .LBB72_10 +; RV64ZVE32F-NEXT: .LBB72_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi 
a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB72_12 +; RV64ZVE32F-NEXT: .LBB72_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) @@ -1455,6 +8309,125 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, float* %base, < ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: lui a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: addiw a1, a1, -1 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a3) +; RV64ZVE32F-NEXT: .LBB73_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: .LBB73_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: .LBB73_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB73_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB73_14 +; RV64ZVE32F-NEXT: .LBB73_8: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_10 +; RV64ZVE32F-NEXT: .LBB73_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: .LBB73_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB73_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: bnez a2, .LBB73_16 +; RV64ZVE32F-NEXT: .LBB73_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB73_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: 
vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_8 +; RV64ZVE32F-NEXT: .LBB73_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB73_9 +; RV64ZVE32F-NEXT: j .LBB73_10 +; RV64ZVE32F-NEXT: .LBB73_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB73_12 +; RV64ZVE32F-NEXT: .LBB73_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: and a1, a2, a1 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %eidxs call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) @@ -1477,6 +8450,117 @@ define void @mscatter_baseidx_v8f32(<8 x float> 
%val, float* %base, <8 x i32> %i ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8f32: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vse32.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB74_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB74_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_13 +; RV64ZVE32F-NEXT: .LBB74_6: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_14 +; RV64ZVE32F-NEXT: .LBB74_7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_9 +; RV64ZVE32F-NEXT: .LBB74_8: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, 
v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB74_9: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_15 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB74_16 +; RV64ZVE32F-NEXT: .LBB74_11: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB74_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v14, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_6 +; RV64ZVE32F-NEXT: .LBB74_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_7 +; RV64ZVE32F-NEXT: .LBB74_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_8 +; RV64ZVE32F-NEXT: j .LBB74_9 +; RV64ZVE32F-NEXT: .LBB74_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 
+; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB74_11 +; RV64ZVE32F-NEXT: .LBB74_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: slli a1, a1, 2 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV64ZVE32F-NEXT: vse32.v v8, (a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds float, float* %base, <8 x i32> %idxs call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %val, <8 x float*> %ptrs, i32 4, <8 x i1> %m) ret void @@ -1485,17 +8569,45 @@ define void @mscatter_baseidx_v8f32(<8 x float> %val, float* %base, <8 x i32> %i declare void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double>, <1 x double*>, i32, <1 x i1>) define void @mscatter_v1f64(<1 x double> %val, <1 x double*> %ptrs, <1 x i1> %m) { -; RV32-LABEL: mscatter_v1f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v1f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v1f64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v1f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vmv.x.s a0, v9 +; RV32ZVE32F-NEXT: andi a0, a0, 1 +; RV32ZVE32F-NEXT: beqz a0, .LBB75_2 +; RV32ZVE32F-NEXT: # %bb.1: # %cond.store +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, 
m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: .LBB75_2: # %else +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v1f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: andi a1, a1, 1 +; RV64ZVE32F-NEXT: beqz a1, .LBB75_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: .LBB75_2: # %else +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> %val, <1 x double*> %ptrs, i32 8, <1 x i1> %m) ret void } @@ -1503,17 +8615,88 @@ define void @mscatter_v1f64(<1 x double> %val, <1 x double*> %ptrs, <1 x i1> %m) declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>) define void @mscatter_v2f64(<2 x double> %val, <2 x double*> %ptrs, <2 x i1> %m) { -; RV32-LABEL: mscatter_v2f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v2f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v2f64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v2f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; 
RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a0, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a0) +; RV32ZVE32F-NEXT: lbu a0, 15(sp) +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB76_3 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a0, a0, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB76_4 +; RV32ZVE32F-NEXT: .LBB76_2: # %else2 +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB76_3: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB76_2 +; RV32ZVE32F-NEXT: .LBB76_4: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa1, 0(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v2f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a2, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a2) +; RV64ZVE32F-NEXT: lbu a2, 15(sp) +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: bnez a3, .LBB76_3 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB76_4 +; RV64ZVE32F-NEXT: .LBB76_2: # %else2 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB76_3: # %cond.store +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; 
RV64ZVE32F-NEXT: beqz a0, .LBB76_2 +; RV64ZVE32F-NEXT: .LBB76_4: # %cond.store1 +; RV64ZVE32F-NEXT: fsd fa1, 0(a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32 8, <2 x i1> %m) ret void } @@ -1521,33 +8704,251 @@ define void @mscatter_v2f64(<2 x double> %val, <2 x double*> %ptrs, <2 x i1> %m) declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double>, <4 x double*>, i32, <4 x i1>) define void @mscatter_v4f64(<4 x double> %val, <4 x double*> %ptrs, <4 x i1> %m) { -; RV32-LABEL: mscatter_v4f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v4f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v4f64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v4f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a0, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a0) +; RV32ZVE32F-NEXT: lbu a0, 15(sp) +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB77_5 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB77_6 +; RV32ZVE32F-NEXT: .LBB77_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: 
bnez a1, .LBB77_7 +; RV32ZVE32F-NEXT: .LBB77_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a0, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB77_8 +; RV32ZVE32F-NEXT: .LBB77_4: # %else6 +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB77_5: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB77_2 +; RV32ZVE32F-NEXT: .LBB77_6: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB77_3 +; RV32ZVE32F-NEXT: .LBB77_7: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB77_4 +; RV32ZVE32F-NEXT: .LBB77_8: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa3, 0(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v4f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a1, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a1) +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: lbu 
a2, 15(sp) +; RV64ZVE32F-NEXT: ld a3, 16(a0) +; RV64ZVE32F-NEXT: ld a4, 8(a0) +; RV64ZVE32F-NEXT: andi a5, a2, 1 +; RV64ZVE32F-NEXT: bnez a5, .LBB77_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB77_6 +; RV64ZVE32F-NEXT: .LBB77_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB77_7 +; RV64ZVE32F-NEXT: .LBB77_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB77_8 +; RV64ZVE32F-NEXT: .LBB77_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB77_5: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a2, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB77_2 +; RV64ZVE32F-NEXT: .LBB77_6: # %cond.store1 +; RV64ZVE32F-NEXT: fsd fa1, 0(a4) +; RV64ZVE32F-NEXT: andi a0, a2, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB77_3 +; RV64ZVE32F-NEXT: .LBB77_7: # %cond.store3 +; RV64ZVE32F-NEXT: fsd fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a0, a2, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB77_4 +; RV64ZVE32F-NEXT: .LBB77_8: # %cond.store5 +; RV64ZVE32F-NEXT: fsd fa3, 0(a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %val, <4 x double*> %ptrs, i32 8, <4 x i1> %m) ret void } define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x double*> %ptrs) { -; RV32-LABEL: mscatter_truemask_v4f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v10 -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_truemask_v4f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v10 +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_truemask_v4f64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v10 ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_truemask_v4f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: 
addi sp, sp, -16 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmset.m v0 +; RV32ZVE32F-NEXT: vmv.v.i v9, 0 +; RV32ZVE32F-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmv.v.i v10, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 0 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32ZVE32F-NEXT: vmsne.vi v9, v10, 0 +; RV32ZVE32F-NEXT: addi a0, sp, 15 +; RV32ZVE32F-NEXT: vsm.v v9, (a0) +; RV32ZVE32F-NEXT: lb a0, 15(sp) +; RV32ZVE32F-NEXT: beqz zero, .LBB78_5 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB78_6 +; RV32ZVE32F-NEXT: .LBB78_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB78_7 +; RV32ZVE32F-NEXT: .LBB78_3: # %else4 +; RV32ZVE32F-NEXT: andi a0, a0, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB78_8 +; RV32ZVE32F-NEXT: .LBB78_4: # %else6 +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB78_5: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB78_2 +; RV32ZVE32F-NEXT: .LBB78_6: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB78_3 +; RV32ZVE32F-NEXT: .LBB78_7: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v9 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB78_4 +; RV32ZVE32F-NEXT: .LBB78_8: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; 
RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa3, 0(a0) +; RV32ZVE32F-NEXT: addi sp, sp, 16 +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_truemask_v4f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: ld a1, 24(a0) +; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a3, 8(a0) +; RV64ZVE32F-NEXT: ld a4, 0(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmset.m v0 +; RV64ZVE32F-NEXT: vmv.v.i v8, 0 +; RV64ZVE32F-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.v.i v9, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, mu +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: vmsne.vi v8, v9, 0 +; RV64ZVE32F-NEXT: addi a0, sp, 15 +; RV64ZVE32F-NEXT: vsm.v v8, (a0) +; RV64ZVE32F-NEXT: lb a0, 15(sp) +; RV64ZVE32F-NEXT: beqz zero, .LBB78_5 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB78_6 +; RV64ZVE32F-NEXT: .LBB78_2: # %else2 +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB78_7 +; RV64ZVE32F-NEXT: .LBB78_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB78_8 +; RV64ZVE32F-NEXT: .LBB78_4: # %else6 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB78_5: # %cond.store +; RV64ZVE32F-NEXT: fsd fa0, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a0, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB78_2 +; RV64ZVE32F-NEXT: .LBB78_6: # %cond.store1 +; RV64ZVE32F-NEXT: fsd fa1, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a0, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB78_3 +; RV64ZVE32F-NEXT: .LBB78_7: # %cond.store3 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a0, a0, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB78_4 +; RV64ZVE32F-NEXT: .LBB78_8: # %cond.store5 +; RV64ZVE32F-NEXT: 
fsd fa3, 0(a1) +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: ret %mhead = insertelement <4 x i1> poison, i1 1, i32 0 %mtrue = shufflevector <4 x i1> %mhead, <4 x i1> poison, <4 x i32> zeroinitializer call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %val, <4 x double*> %ptrs, i32 8, <4 x i1> %mtrue) @@ -1565,30 +8966,183 @@ define void @mscatter_falsemask_v4f64(<4 x double> %val, <4 x double*> %ptrs) { declare void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double>, <8 x double*>, i32, <8 x i1>) define void @mscatter_v8f64(<8 x double> %val, <8 x double*> %ptrs, <8 x i1> %m) { -; RV32-LABEL: mscatter_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_v8f64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB80_9 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB80_10 +; RV32ZVE32F-NEXT: .LBB80_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB80_11 +; RV32ZVE32F-NEXT: .LBB80_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB80_12 +; RV32ZVE32F-NEXT: .LBB80_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB80_13 +; RV32ZVE32F-NEXT: .LBB80_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB80_14 +; RV32ZVE32F-NEXT: .LBB80_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB80_15 +; 
RV32ZVE32F-NEXT: .LBB80_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB80_16 +; RV32ZVE32F-NEXT: .LBB80_8: # %else14 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB80_9: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB80_2 +; RV32ZVE32F-NEXT: .LBB80_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB80_3 +; RV32ZVE32F-NEXT: .LBB80_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB80_4 +; RV32ZVE32F-NEXT: .LBB80_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB80_5 +; RV32ZVE32F-NEXT: .LBB80_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB80_6 +; RV32ZVE32F-NEXT: .LBB80_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB80_7 +; RV32ZVE32F-NEXT: .LBB80_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: 
vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB80_8 +; RV32ZVE32F-NEXT: .LBB80_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld a1, 56(a0) +; RV64ZVE32F-NEXT: ld a2, 48(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 32(a0) +; RV64ZVE32F-NEXT: ld a6, 24(a0) +; RV64ZVE32F-NEXT: ld a7, 16(a0) +; RV64ZVE32F-NEXT: ld t0, 8(a0) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi t1, a3, 1 +; RV64ZVE32F-NEXT: bnez t1, .LBB80_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB80_10 +; RV64ZVE32F-NEXT: .LBB80_2: # %else2 +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB80_11 +; RV64ZVE32F-NEXT: .LBB80_3: # %else4 +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: bnez a0, .LBB80_12 +; RV64ZVE32F-NEXT: .LBB80_4: # %else6 +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB80_13 +; RV64ZVE32F-NEXT: .LBB80_5: # %else8 +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB80_14 +; RV64ZVE32F-NEXT: .LBB80_6: # %else10 +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: bnez a0, .LBB80_15 +; RV64ZVE32F-NEXT: .LBB80_7: # %else12 +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB80_16 +; RV64ZVE32F-NEXT: .LBB80_8: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB80_9: # %cond.store +; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: fsd fa0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a3, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB80_2 +; RV64ZVE32F-NEXT: .LBB80_10: # %cond.store1 +; RV64ZVE32F-NEXT: fsd fa1, 
0(t0) +; RV64ZVE32F-NEXT: andi a0, a3, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB80_3 +; RV64ZVE32F-NEXT: .LBB80_11: # %cond.store3 +; RV64ZVE32F-NEXT: fsd fa2, 0(a7) +; RV64ZVE32F-NEXT: andi a0, a3, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB80_4 +; RV64ZVE32F-NEXT: .LBB80_12: # %cond.store5 +; RV64ZVE32F-NEXT: fsd fa3, 0(a6) +; RV64ZVE32F-NEXT: andi a0, a3, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB80_5 +; RV64ZVE32F-NEXT: .LBB80_13: # %cond.store7 +; RV64ZVE32F-NEXT: fsd fa4, 0(a5) +; RV64ZVE32F-NEXT: andi a0, a3, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB80_6 +; RV64ZVE32F-NEXT: .LBB80_14: # %cond.store9 +; RV64ZVE32F-NEXT: fsd fa5, 0(a4) +; RV64ZVE32F-NEXT: andi a0, a3, 64 +; RV64ZVE32F-NEXT: beqz a0, .LBB80_7 +; RV64ZVE32F-NEXT: .LBB80_15: # %cond.store11 +; RV64ZVE32F-NEXT: fsd fa6, 0(a2) +; RV64ZVE32F-NEXT: andi a0, a3, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB80_8 +; RV64ZVE32F-NEXT: .LBB80_16: # %cond.store13 +; RV64ZVE32F-NEXT: fsd fa7, 0(a1) +; RV64ZVE32F-NEXT: ret call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) ret void } define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, double* %base, <8 x i8> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_v8i8_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsext.vf4 v14, v12 -; RV32-NEXT: vsll.vi v12, v14, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_v8i8_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsext.vf4 v14, v12 +; RV32V-NEXT: vsll.vi v12, v14, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_v8i8_v8f64: ; RV64: # %bb.0: @@ -1597,22 +9151,203 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, double* %base, <8 x ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, 
(a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_v8i8_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB81_9 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB81_10 +; RV32ZVE32F-NEXT: .LBB81_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB81_11 +; RV32ZVE32F-NEXT: .LBB81_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB81_12 +; RV32ZVE32F-NEXT: .LBB81_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB81_13 +; RV32ZVE32F-NEXT: .LBB81_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB81_14 +; RV32ZVE32F-NEXT: .LBB81_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB81_15 +; RV32ZVE32F-NEXT: .LBB81_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB81_16 +; RV32ZVE32F-NEXT: .LBB81_8: # %else14 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB81_9: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB81_2 +; RV32ZVE32F-NEXT: .LBB81_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB81_3 +; RV32ZVE32F-NEXT: .LBB81_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: 
vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB81_4 +; RV32ZVE32F-NEXT: .LBB81_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB81_5 +; RV32ZVE32F-NEXT: .LBB81_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB81_6 +; RV32ZVE32F-NEXT: .LBB81_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB81_7 +; RV32ZVE32F-NEXT: .LBB81_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB81_8 +; RV32ZVE32F-NEXT: .LBB81_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa0, 0(a2) +; RV64ZVE32F-NEXT: .LBB81_2: # %else +; 
RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa1, 0(a2) +; RV64ZVE32F-NEXT: .LBB81_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB81_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB81_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB81_14 +; RV64ZVE32F-NEXT: .LBB81_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_10 +; RV64ZVE32F-NEXT: .LBB81_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB81_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB81_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB81_16 +; RV64ZVE32F-NEXT: .LBB81_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB81_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi 
v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_8 +; RV64ZVE32F-NEXT: .LBB81_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB81_9 +; RV64ZVE32F-NEXT: j .LBB81_10 +; RV64ZVE32F-NEXT: .LBB81_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa6, 0(a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB81_12 +; RV64ZVE32F-NEXT: .LBB81_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 3 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: fsd fa7, 0(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds double, double* %base, <8 x i8> %idxs call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) ret void } define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, double* %base, <8 x i8> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_sext_v8i8_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf8 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_sext_v8i8_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf8 v16, v12 +; RV32V-NEXT: vsll.vi v12, 
v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_sext_v8i8_v8f64: ; RV64: # %bb.0: @@ -1621,6 +9356,187 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, double* %base, ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB82_9 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB82_10 +; RV32ZVE32F-NEXT: .LBB82_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB82_11 +; RV32ZVE32F-NEXT: .LBB82_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB82_12 +; RV32ZVE32F-NEXT: .LBB82_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB82_13 +; RV32ZVE32F-NEXT: .LBB82_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB82_14 +; RV32ZVE32F-NEXT: .LBB82_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB82_15 +; RV32ZVE32F-NEXT: .LBB82_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB82_16 +; RV32ZVE32F-NEXT: .LBB82_8: # %else14 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB82_9: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB82_2 +; 
RV32ZVE32F-NEXT: .LBB82_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB82_3 +; RV32ZVE32F-NEXT: .LBB82_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB82_4 +; RV32ZVE32F-NEXT: .LBB82_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB82_5 +; RV32ZVE32F-NEXT: .LBB82_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB82_6 +; RV32ZVE32F-NEXT: .LBB82_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB82_7 +; RV32ZVE32F-NEXT: .LBB82_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB82_8 +; RV32ZVE32F-NEXT: .LBB82_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; 
RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa0, 0(a2) +; RV64ZVE32F-NEXT: .LBB82_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa1, 0(a2) +; RV64ZVE32F-NEXT: .LBB82_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB82_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB82_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB82_14 +; RV64ZVE32F-NEXT: .LBB82_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_10 +; RV64ZVE32F-NEXT: .LBB82_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB82_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli 
zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB82_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB82_16 +; RV64ZVE32F-NEXT: .LBB82_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB82_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_8 +; RV64ZVE32F-NEXT: .LBB82_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB82_9 +; RV64ZVE32F-NEXT: j .LBB82_10 +; RV64ZVE32F-NEXT: .LBB82_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa6, 0(a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB82_12 +; RV64ZVE32F-NEXT: .LBB82_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 3 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: fsd fa7, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) @@ -1628,16 +9544,16 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, double* %base, } define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, double* 
%base, <8 x i8> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf8 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_zext_v8i8_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vzext.vf8 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_zext_v8i8_v8f64: ; RV64: # %bb.0: @@ -1646,6 +9562,195 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, double* %base, ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vzext.vf4 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB83_9 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB83_10 +; RV32ZVE32F-NEXT: .LBB83_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB83_11 +; RV32ZVE32F-NEXT: .LBB83_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB83_12 +; RV32ZVE32F-NEXT: .LBB83_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB83_13 +; RV32ZVE32F-NEXT: .LBB83_5: # %else8 +; RV32ZVE32F-NEXT: 
andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB83_14 +; RV32ZVE32F-NEXT: .LBB83_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB83_15 +; RV32ZVE32F-NEXT: .LBB83_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB83_16 +; RV32ZVE32F-NEXT: .LBB83_8: # %else14 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB83_9: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB83_2 +; RV32ZVE32F-NEXT: .LBB83_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB83_3 +; RV32ZVE32F-NEXT: .LBB83_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB83_4 +; RV32ZVE32F-NEXT: .LBB83_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB83_5 +; RV32ZVE32F-NEXT: .LBB83_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB83_6 +; RV32ZVE32F-NEXT: .LBB83_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 
64 +; RV32ZVE32F-NEXT: beqz a1, .LBB83_7 +; RV32ZVE32F-NEXT: .LBB83_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB83_8 +; RV32ZVE32F-NEXT: .LBB83_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa0, 0(a2) +; RV64ZVE32F-NEXT: .LBB83_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa1, 0(a2) +; RV64ZVE32F-NEXT: .LBB83_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB83_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; 
RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_14 +; RV64ZVE32F-NEXT: .LBB83_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_10 +; RV64ZVE32F-NEXT: .LBB83_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB83_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB83_16 +; RV64ZVE32F-NEXT: .LBB83_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB83_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_8 +; RV64ZVE32F-NEXT: .LBB83_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_9 +; RV64ZVE32F-NEXT: j .LBB83_10 +; RV64ZVE32F-NEXT: .LBB83_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; 
RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa6, 0(a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB83_12 +; RV64ZVE32F-NEXT: .LBB83_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: andi a1, a1, 255 +; RV64ZVE32F-NEXT: slli a1, a1, 3 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: fsd fa7, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) @@ -1653,14 +9758,14 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, double* %base, } define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, double* %base, <8 x i16> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_v8i16_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsext.vf2 v14, v12 -; RV32-NEXT: vsll.vi v12, v14, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_v8i16_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsext.vf2 v14, v12 +; RV32V-NEXT: vsll.vi v12, v14, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_v8i16_v8f64: ; RV64: # %bb.0: @@ -1669,22 +9774,204 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, double* %base, <8 x ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_v8i16_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; 
RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB84_9 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB84_10 +; RV32ZVE32F-NEXT: .LBB84_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB84_11 +; RV32ZVE32F-NEXT: .LBB84_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB84_12 +; RV32ZVE32F-NEXT: .LBB84_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB84_13 +; RV32ZVE32F-NEXT: .LBB84_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB84_14 +; RV32ZVE32F-NEXT: .LBB84_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB84_15 +; RV32ZVE32F-NEXT: .LBB84_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB84_16 +; RV32ZVE32F-NEXT: .LBB84_8: # %else14 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB84_9: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB84_2 +; RV32ZVE32F-NEXT: .LBB84_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB84_3 +; RV32ZVE32F-NEXT: .LBB84_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB84_4 +; RV32ZVE32F-NEXT: .LBB84_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: 
vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB84_5 +; RV32ZVE32F-NEXT: .LBB84_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB84_6 +; RV32ZVE32F-NEXT: .LBB84_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB84_7 +; RV32ZVE32F-NEXT: .LBB84_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB84_8 +; RV32ZVE32F-NEXT: .LBB84_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i16_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa0, 0(a2) +; RV64ZVE32F-NEXT: .LBB84_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: 
vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa1, 0(a2) +; RV64ZVE32F-NEXT: .LBB84_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB84_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB84_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB84_14 +; RV64ZVE32F-NEXT: .LBB84_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_10 +; RV64ZVE32F-NEXT: .LBB84_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB84_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB84_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB84_16 +; RV64ZVE32F-NEXT: .LBB84_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB84_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 
+; RV64ZVE32F-NEXT: beqz a2, .LBB84_8 +; RV64ZVE32F-NEXT: .LBB84_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB84_9 +; RV64ZVE32F-NEXT: j .LBB84_10 +; RV64ZVE32F-NEXT: .LBB84_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa6, 0(a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB84_12 +; RV64ZVE32F-NEXT: .LBB84_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 3 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: fsd fa7, 0(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds double, double* %base, <8 x i16> %idxs call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) ret void } define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, double* %base, <8 x i16> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf4 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_sext_v8i16_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf4 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, 
(a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_sext_v8i16_v8f64: ; RV64: # %bb.0: @@ -1693,6 +9980,188 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, double* %base, ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB85_9 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB85_10 +; RV32ZVE32F-NEXT: .LBB85_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB85_11 +; RV32ZVE32F-NEXT: .LBB85_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB85_12 +; RV32ZVE32F-NEXT: .LBB85_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB85_13 +; RV32ZVE32F-NEXT: .LBB85_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB85_14 +; RV32ZVE32F-NEXT: .LBB85_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB85_15 +; RV32ZVE32F-NEXT: .LBB85_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB85_16 +; RV32ZVE32F-NEXT: .LBB85_8: # %else14 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB85_9: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB85_2 +; RV32ZVE32F-NEXT: .LBB85_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 
+; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB85_3 +; RV32ZVE32F-NEXT: .LBB85_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB85_4 +; RV32ZVE32F-NEXT: .LBB85_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB85_5 +; RV32ZVE32F-NEXT: .LBB85_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB85_6 +; RV32ZVE32F-NEXT: .LBB85_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB85_7 +; RV32ZVE32F-NEXT: .LBB85_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB85_8 +; RV32ZVE32F-NEXT: .LBB85_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, 
a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa0, 0(a2) +; RV64ZVE32F-NEXT: .LBB85_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa1, 0(a2) +; RV64ZVE32F-NEXT: .LBB85_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB85_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB85_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB85_14 +; RV64ZVE32F-NEXT: .LBB85_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_10 +; RV64ZVE32F-NEXT: .LBB85_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB85_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, 
.LBB85_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB85_16 +; RV64ZVE32F-NEXT: .LBB85_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB85_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_8 +; RV64ZVE32F-NEXT: .LBB85_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB85_9 +; RV64ZVE32F-NEXT: j .LBB85_10 +; RV64ZVE32F-NEXT: .LBB85_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa6, 0(a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB85_12 +; RV64ZVE32F-NEXT: .LBB85_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 3 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: fsd fa7, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) @@ -1700,16 +10169,16 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, double* %base, } define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, double* %base, <8 x i16> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: 
vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf4 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_zext_v8i16_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vzext.vf4 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_zext_v8i16_v8f64: ; RV64: # %bb.0: @@ -1718,6 +10187,198 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, double* %base, ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB86_9 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB86_10 +; RV32ZVE32F-NEXT: .LBB86_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB86_11 +; RV32ZVE32F-NEXT: .LBB86_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB86_12 +; RV32ZVE32F-NEXT: .LBB86_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB86_13 +; RV32ZVE32F-NEXT: .LBB86_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB86_14 +; RV32ZVE32F-NEXT: .LBB86_6: # %else10 +; RV32ZVE32F-NEXT: 
andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB86_15 +; RV32ZVE32F-NEXT: .LBB86_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB86_16 +; RV32ZVE32F-NEXT: .LBB86_8: # %else14 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB86_9: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB86_2 +; RV32ZVE32F-NEXT: .LBB86_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB86_3 +; RV32ZVE32F-NEXT: .LBB86_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB86_4 +; RV32ZVE32F-NEXT: .LBB86_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB86_5 +; RV32ZVE32F-NEXT: .LBB86_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB86_6 +; RV32ZVE32F-NEXT: .LBB86_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB86_7 +; RV32ZVE32F-NEXT: .LBB86_15: # %cond.store11 +; RV32ZVE32F-NEXT: 
vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB86_8 +; RV32ZVE32F-NEXT: .LBB86_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: lui a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: addiw a1, a1, -1 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: fsd fa0, 0(a3) +; RV64ZVE32F-NEXT: .LBB86_2: # %else +; RV64ZVE32F-NEXT: andi a3, a2, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: fsd fa1, 0(a3) +; RV64ZVE32F-NEXT: .LBB86_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: fsd fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB86_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, 
ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_14 +; RV64ZVE32F-NEXT: .LBB86_8: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_10 +; RV64ZVE32F-NEXT: .LBB86_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: fsd fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB86_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: bnez a2, .LBB86_16 +; RV64ZVE32F-NEXT: .LBB86_12: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB86_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: fsd fa3, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_8 +; RV64ZVE32F-NEXT: .LBB86_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: fsd fa4, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_9 +; RV64ZVE32F-NEXT: j .LBB86_10 +; RV64ZVE32F-NEXT: .LBB86_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; 
RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: fsd fa6, 0(a3) +; RV64ZVE32F-NEXT: andi a2, a2, -128 +; RV64ZVE32F-NEXT: beqz a2, .LBB86_12 +; RV64ZVE32F-NEXT: .LBB86_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: and a1, a2, a1 +; RV64ZVE32F-NEXT: slli a1, a1, 3 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: fsd fa7, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) @@ -1725,13 +10386,13 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, double* %base, } define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, double* %base, <8 x i32> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_v8i32_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vsll.vi v12, v12, 3 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_v8i32_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32V-NEXT: vsll.vi v12, v12, 3 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_v8i32_v8f64: ; RV64: # %bb.0: @@ -1740,22 +10401,205 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, double* %base, <8 x ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_v8i32_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: 
vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB87_9 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB87_10 +; RV32ZVE32F-NEXT: .LBB87_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB87_11 +; RV32ZVE32F-NEXT: .LBB87_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB87_12 +; RV32ZVE32F-NEXT: .LBB87_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB87_13 +; RV32ZVE32F-NEXT: .LBB87_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB87_14 +; RV32ZVE32F-NEXT: .LBB87_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB87_15 +; RV32ZVE32F-NEXT: .LBB87_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB87_16 +; RV32ZVE32F-NEXT: .LBB87_8: # %else14 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB87_9: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB87_2 +; RV32ZVE32F-NEXT: .LBB87_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB87_3 +; RV32ZVE32F-NEXT: .LBB87_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB87_4 +; RV32ZVE32F-NEXT: .LBB87_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; 
RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB87_5 +; RV32ZVE32F-NEXT: .LBB87_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB87_6 +; RV32ZVE32F-NEXT: .LBB87_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB87_7 +; RV32ZVE32F-NEXT: .LBB87_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB87_8 +; RV32ZVE32F-NEXT: .LBB87_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8i32_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa0, 0(a2) +; RV64ZVE32F-NEXT: .LBB87_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; 
RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa1, 0(a2) +; RV64ZVE32F-NEXT: .LBB87_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB87_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB87_13 +; RV64ZVE32F-NEXT: .LBB87_6: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB87_14 +; RV64ZVE32F-NEXT: .LBB87_7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_9 +; RV64ZVE32F-NEXT: .LBB87_8: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB87_9: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB87_15 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB87_16 +; RV64ZVE32F-NEXT: .LBB87_11: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB87_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_6 +; RV64ZVE32F-NEXT: .LBB87_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz 
a2, .LBB87_7 +; RV64ZVE32F-NEXT: .LBB87_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB87_8 +; RV64ZVE32F-NEXT: j .LBB87_9 +; RV64ZVE32F-NEXT: .LBB87_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa6, 0(a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB87_11 +; RV64ZVE32F-NEXT: .LBB87_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 3 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: fsd fa7, 0(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds double, double* %base, <8 x i32> %idxs call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) ret void } define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, double* %base, <8 x i32> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_sext_v8i32_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsext.vf2 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_sext_v8i32_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsext.vf2 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: 
ret ; ; RV64-LABEL: mscatter_baseidx_sext_v8i32_v8f64: ; RV64: # %bb.0: @@ -1764,6 +10608,189 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, double* %base, ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB88_9 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB88_10 +; RV32ZVE32F-NEXT: .LBB88_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB88_11 +; RV32ZVE32F-NEXT: .LBB88_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB88_12 +; RV32ZVE32F-NEXT: .LBB88_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB88_13 +; RV32ZVE32F-NEXT: .LBB88_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB88_14 +; RV32ZVE32F-NEXT: .LBB88_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB88_15 +; RV32ZVE32F-NEXT: .LBB88_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB88_16 +; RV32ZVE32F-NEXT: .LBB88_8: # %else14 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB88_9: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB88_2 +; RV32ZVE32F-NEXT: .LBB88_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 
4 +; RV32ZVE32F-NEXT: beqz a1, .LBB88_3 +; RV32ZVE32F-NEXT: .LBB88_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB88_4 +; RV32ZVE32F-NEXT: .LBB88_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB88_5 +; RV32ZVE32F-NEXT: .LBB88_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB88_6 +; RV32ZVE32F-NEXT: .LBB88_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB88_7 +; RV32ZVE32F-NEXT: .LBB88_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB88_8 +; RV32ZVE32F-NEXT: .LBB88_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_2 +; RV64ZVE32F-NEXT: # 
%bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa0, 0(a2) +; RV64ZVE32F-NEXT: .LBB88_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa1, 0(a2) +; RV64ZVE32F-NEXT: .LBB88_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB88_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB88_13 +; RV64ZVE32F-NEXT: .LBB88_6: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB88_14 +; RV64ZVE32F-NEXT: .LBB88_7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_9 +; RV64ZVE32F-NEXT: .LBB88_8: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB88_9: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB88_15 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB88_16 +; RV64ZVE32F-NEXT: .LBB88_11: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB88_12: # %cond.store3 +; 
RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_6 +; RV64ZVE32F-NEXT: .LBB88_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_7 +; RV64ZVE32F-NEXT: .LBB88_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB88_8 +; RV64ZVE32F-NEXT: j .LBB88_9 +; RV64ZVE32F-NEXT: .LBB88_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa6, 0(a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB88_11 +; RV64ZVE32F-NEXT: .LBB88_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 3 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: fsd fa7, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = sext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) @@ -1771,16 +10798,16 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, double* %base, } define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, double* %base, <8 x i32> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_zext_v8i32_v8f64: -; RV32: # %bb.0: -; 
RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vzext.vf2 v16, v12 -; RV32-NEXT: vsll.vi v12, v16, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_zext_v8i32_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vzext.vf2 v16, v12 +; RV32V-NEXT: vsll.vi v12, v16, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_zext_v8i32_v8f64: ; RV64: # %bb.0: @@ -1789,6 +10816,197 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, double* %base, ; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB89_9 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB89_10 +; RV32ZVE32F-NEXT: .LBB89_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB89_11 +; RV32ZVE32F-NEXT: .LBB89_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB89_12 +; RV32ZVE32F-NEXT: .LBB89_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB89_13 +; RV32ZVE32F-NEXT: .LBB89_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB89_14 +; RV32ZVE32F-NEXT: .LBB89_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; 
RV32ZVE32F-NEXT: bnez a1, .LBB89_15 +; RV32ZVE32F-NEXT: .LBB89_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB89_16 +; RV32ZVE32F-NEXT: .LBB89_8: # %else14 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB89_9: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB89_2 +; RV32ZVE32F-NEXT: .LBB89_10: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB89_3 +; RV32ZVE32F-NEXT: .LBB89_11: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB89_4 +; RV32ZVE32F-NEXT: .LBB89_12: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB89_5 +; RV32ZVE32F-NEXT: .LBB89_13: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB89_6 +; RV32ZVE32F-NEXT: .LBB89_14: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB89_7 +; RV32ZVE32F-NEXT: .LBB89_15: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, 
ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: beqz a0, .LBB89_8 +; RV32ZVE32F-NEXT: .LBB89_16: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa0, 0(a2) +; RV64ZVE32F-NEXT: .LBB89_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa1, 0(a2) +; RV64ZVE32F-NEXT: .LBB89_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_13 +; RV64ZVE32F-NEXT: .LBB89_6: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_14 +; RV64ZVE32F-NEXT: .LBB89_7: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 
32 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_9 +; RV64ZVE32F-NEXT: .LBB89_8: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB89_9: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_15 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB89_16 +; RV64ZVE32F-NEXT: .LBB89_11: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB89_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_6 +; RV64ZVE32F-NEXT: .LBB89_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_7 +; RV64ZVE32F-NEXT: .LBB89_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_8 +; RV64ZVE32F-NEXT: j .LBB89_9 +; RV64ZVE32F-NEXT: .LBB89_15: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; 
RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa6, 0(a2) +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB89_11 +; RV64ZVE32F-NEXT: .LBB89_16: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v8 +; RV64ZVE32F-NEXT: slli a1, a1, 32 +; RV64ZVE32F-NEXT: srli a1, a1, 29 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: fsd fa7, 0(a0) +; RV64ZVE32F-NEXT: ret %eidxs = zext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %eidxs call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) @@ -1796,15 +11014,15 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, double* %base, } define void @mscatter_baseidx_v8f64(<8 x double> %val, double* %base, <8 x i64> %idxs, <8 x i1> %m) { -; RV32-LABEL: mscatter_baseidx_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vsll.vi v12, v12, 3 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vncvt.x.x.w v16, v12 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: ret +; RV32V-LABEL: mscatter_baseidx_v8f64: +; RV32V: # %bb.0: +; RV32V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32V-NEXT: vsll.vi v12, v12, 3 +; RV32V-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32V-NEXT: vncvt.x.x.w v16, v12 +; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32V-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; RV32V-NEXT: ret ; ; RV64-LABEL: mscatter_baseidx_v8f64: ; RV64: # %bb.0: @@ -1812,6 +11030,209 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, double* %base, <8 x i64> ; RV64-NEXT: vsll.vi v12, v12, 3 ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret +; +; RV32ZVE32F-LABEL: mscatter_baseidx_v8f64: +; RV32ZVE32F: # %bb.0: +; RV32ZVE32F-NEXT: addi sp, sp, -96 +; RV32ZVE32F-NEXT: 
.cfi_def_cfa_offset 96 +; RV32ZVE32F-NEXT: sw ra, 92(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s0, 88(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: .cfi_offset ra, -4 +; RV32ZVE32F-NEXT: .cfi_offset s0, -8 +; RV32ZVE32F-NEXT: addi s0, sp, 96 +; RV32ZVE32F-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVE32F-NEXT: andi sp, sp, -32 +; RV32ZVE32F-NEXT: lw a2, 0(a1) +; RV32ZVE32F-NEXT: lw a3, 8(a1) +; RV32ZVE32F-NEXT: lw a4, 16(a1) +; RV32ZVE32F-NEXT: lw a5, 24(a1) +; RV32ZVE32F-NEXT: lw a6, 56(a1) +; RV32ZVE32F-NEXT: lw a7, 48(a1) +; RV32ZVE32F-NEXT: lw t0, 40(a1) +; RV32ZVE32F-NEXT: lw a1, 32(a1) +; RV32ZVE32F-NEXT: sw a6, 60(sp) +; RV32ZVE32F-NEXT: sw a7, 56(sp) +; RV32ZVE32F-NEXT: sw t0, 52(sp) +; RV32ZVE32F-NEXT: sw a1, 48(sp) +; RV32ZVE32F-NEXT: sw a5, 44(sp) +; RV32ZVE32F-NEXT: sw a4, 40(sp) +; RV32ZVE32F-NEXT: sw a3, 36(sp) +; RV32ZVE32F-NEXT: sw a2, 32(sp) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32ZVE32F-NEXT: addi a1, sp, 32 +; RV32ZVE32F-NEXT: vle32.v v8, (a1) +; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 +; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a0, v0 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB90_10 +; RV32ZVE32F-NEXT: # %bb.1: # %else +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB90_11 +; RV32ZVE32F-NEXT: .LBB90_2: # %else2 +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB90_12 +; RV32ZVE32F-NEXT: .LBB90_3: # %else4 +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB90_13 +; RV32ZVE32F-NEXT: .LBB90_4: # %else6 +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB90_14 +; RV32ZVE32F-NEXT: .LBB90_5: # %else8 +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB90_15 +; RV32ZVE32F-NEXT: .LBB90_6: # %else10 +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB90_16 +; RV32ZVE32F-NEXT: .LBB90_7: # %else12 +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; 
RV32ZVE32F-NEXT: beqz a0, .LBB90_9 +; RV32ZVE32F-NEXT: .LBB90_8: # %cond.store13 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa7, 0(a0) +; RV32ZVE32F-NEXT: .LBB90_9: # %else14 +; RV32ZVE32F-NEXT: addi sp, s0, -96 +; RV32ZVE32F-NEXT: lw ra, 92(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s0, 88(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 96 +; RV32ZVE32F-NEXT: ret +; RV32ZVE32F-NEXT: .LBB90_10: # %cond.store +; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB90_2 +; RV32ZVE32F-NEXT: .LBB90_11: # %cond.store1 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB90_3 +; RV32ZVE32F-NEXT: .LBB90_12: # %cond.store3 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB90_4 +; RV32ZVE32F-NEXT: .LBB90_13: # %cond.store5 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB90_5 +; RV32ZVE32F-NEXT: .LBB90_14: # %cond.store7 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB90_6 +; RV32ZVE32F-NEXT: .LBB90_15: # %cond.store9 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu 
+; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a0, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB90_7 +; RV32ZVE32F-NEXT: .LBB90_16: # %cond.store11 +; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, mu +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: bnez a0, .LBB90_8 +; RV32ZVE32F-NEXT: j .LBB90_9 +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v8f64: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: ld t1, 8(a1) +; RV64ZVE32F-NEXT: ld t0, 16(a1) +; RV64ZVE32F-NEXT: ld a7, 24(a1) +; RV64ZVE32F-NEXT: ld a6, 32(a1) +; RV64ZVE32F-NEXT: ld a5, 40(a1) +; RV64ZVE32F-NEXT: ld a4, 48(a1) +; RV64ZVE32F-NEXT: ld a2, 56(a1) +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a3, v0 +; RV64ZVE32F-NEXT: andi t2, a3, 1 +; RV64ZVE32F-NEXT: bnez t2, .LBB90_9 +; RV64ZVE32F-NEXT: # %bb.1: # %else +; RV64ZVE32F-NEXT: andi a1, a3, 2 +; RV64ZVE32F-NEXT: bnez a1, .LBB90_10 +; RV64ZVE32F-NEXT: .LBB90_2: # %else2 +; RV64ZVE32F-NEXT: andi a1, a3, 4 +; RV64ZVE32F-NEXT: bnez a1, .LBB90_11 +; RV64ZVE32F-NEXT: .LBB90_3: # %else4 +; RV64ZVE32F-NEXT: andi a1, a3, 8 +; RV64ZVE32F-NEXT: bnez a1, .LBB90_12 +; RV64ZVE32F-NEXT: .LBB90_4: # %else6 +; RV64ZVE32F-NEXT: andi a1, a3, 16 +; RV64ZVE32F-NEXT: bnez a1, .LBB90_13 +; RV64ZVE32F-NEXT: .LBB90_5: # %else8 +; RV64ZVE32F-NEXT: andi a1, a3, 32 +; RV64ZVE32F-NEXT: bnez a1, .LBB90_14 +; RV64ZVE32F-NEXT: .LBB90_6: # %else10 +; RV64ZVE32F-NEXT: andi a1, a3, 64 +; RV64ZVE32F-NEXT: bnez a1, .LBB90_15 +; RV64ZVE32F-NEXT: .LBB90_7: # %else12 +; RV64ZVE32F-NEXT: andi a1, a3, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB90_16 +; RV64ZVE32F-NEXT: .LBB90_8: # %else14 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB90_9: # %cond.store +; RV64ZVE32F-NEXT: ld a1, 0(a1) +; RV64ZVE32F-NEXT: slli a1, a1, 3 +; RV64ZVE32F-NEXT: add a1, a0, a1 +; 
RV64ZVE32F-NEXT: fsd fa0, 0(a1) +; RV64ZVE32F-NEXT: andi a1, a3, 2 +; RV64ZVE32F-NEXT: beqz a1, .LBB90_2 +; RV64ZVE32F-NEXT: .LBB90_10: # %cond.store1 +; RV64ZVE32F-NEXT: slli a1, t1, 3 +; RV64ZVE32F-NEXT: add a1, a0, a1 +; RV64ZVE32F-NEXT: fsd fa1, 0(a1) +; RV64ZVE32F-NEXT: andi a1, a3, 4 +; RV64ZVE32F-NEXT: beqz a1, .LBB90_3 +; RV64ZVE32F-NEXT: .LBB90_11: # %cond.store3 +; RV64ZVE32F-NEXT: slli a1, t0, 3 +; RV64ZVE32F-NEXT: add a1, a0, a1 +; RV64ZVE32F-NEXT: fsd fa2, 0(a1) +; RV64ZVE32F-NEXT: andi a1, a3, 8 +; RV64ZVE32F-NEXT: beqz a1, .LBB90_4 +; RV64ZVE32F-NEXT: .LBB90_12: # %cond.store5 +; RV64ZVE32F-NEXT: slli a1, a7, 3 +; RV64ZVE32F-NEXT: add a1, a0, a1 +; RV64ZVE32F-NEXT: fsd fa3, 0(a1) +; RV64ZVE32F-NEXT: andi a1, a3, 16 +; RV64ZVE32F-NEXT: beqz a1, .LBB90_5 +; RV64ZVE32F-NEXT: .LBB90_13: # %cond.store7 +; RV64ZVE32F-NEXT: slli a1, a6, 3 +; RV64ZVE32F-NEXT: add a1, a0, a1 +; RV64ZVE32F-NEXT: fsd fa4, 0(a1) +; RV64ZVE32F-NEXT: andi a1, a3, 32 +; RV64ZVE32F-NEXT: beqz a1, .LBB90_6 +; RV64ZVE32F-NEXT: .LBB90_14: # %cond.store9 +; RV64ZVE32F-NEXT: slli a1, a5, 3 +; RV64ZVE32F-NEXT: add a1, a0, a1 +; RV64ZVE32F-NEXT: fsd fa5, 0(a1) +; RV64ZVE32F-NEXT: andi a1, a3, 64 +; RV64ZVE32F-NEXT: beqz a1, .LBB90_7 +; RV64ZVE32F-NEXT: .LBB90_15: # %cond.store11 +; RV64ZVE32F-NEXT: slli a1, a4, 3 +; RV64ZVE32F-NEXT: add a1, a0, a1 +; RV64ZVE32F-NEXT: fsd fa6, 0(a1) +; RV64ZVE32F-NEXT: andi a1, a3, -128 +; RV64ZVE32F-NEXT: beqz a1, .LBB90_8 +; RV64ZVE32F-NEXT: .LBB90_16: # %cond.store13 +; RV64ZVE32F-NEXT: slli a1, a2, 3 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: fsd fa7, 0(a0) +; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds double, double* %base, <8 x i64> %idxs call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> %val, <8 x double*> %ptrs, i32 8, <8 x i1> %m) ret void @@ -1835,6 +11256,206 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, i8* %base, <16 x i8> %idxs, ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, 
(a0), v16, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v16i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB91_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB91_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB91_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_28 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_29 +; RV64ZVE32F-NEXT: .LBB91_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_10 +; RV64ZVE32F-NEXT: .LBB91_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB91_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_30 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_31 +; RV64ZVE32F-NEXT: .LBB91_12: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_32 +; RV64ZVE32F-NEXT: .LBB91_13: # %else16 +; RV64ZVE32F-NEXT: andi a2, a1, 512 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_15 +; RV64ZVE32F-NEXT: .LBB91_14: # %cond.store17 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 9 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB91_15: # %else18 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 1024 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_17 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.store19 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 10 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB91_17: # %else20 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: lui a2, 1 +; RV64ZVE32F-NEXT: addiw a3, a2, -2048 +; RV64ZVE32F-NEXT: and a3, a1, a3 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: 
beqz a3, .LBB91_19 +; RV64ZVE32F-NEXT: # %bb.18: # %cond.store21 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 11 +; RV64ZVE32F-NEXT: vse8.v v10, (a3) +; RV64ZVE32F-NEXT: .LBB91_19: # %else22 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_21 +; RV64ZVE32F-NEXT: # %bb.20: # %cond.store23 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 12 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB91_21: # %else24 +; RV64ZVE32F-NEXT: lui a2, 2 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_23 +; RV64ZVE32F-NEXT: # %bb.22: # %cond.store25 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 13 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB91_23: # %else26 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 4 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_25 +; RV64ZVE32F-NEXT: # %bb.24: # %cond.store27 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 14 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB91_25: # %else28 +; RV64ZVE32F-NEXT: lui a2, 1048568 +; RV64ZVE32F-NEXT: and a1, a1, a2 +; RV64ZVE32F-NEXT: beqz a1, .LBB91_27 +; RV64ZVE32F-NEXT: # %bb.26: # %cond.store29 +; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 15 +; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: .LBB91_27: # %else30 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB91_28: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_8 +; RV64ZVE32F-NEXT: .LBB91_29: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 4 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_9 +; RV64ZVE32F-NEXT: j .LBB91_10 +; RV64ZVE32F-NEXT: .LBB91_30: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 6 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_12 +; RV64ZVE32F-NEXT: .LBB91_31: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_13 +; RV64ZVE32F-NEXT: .LBB91_32: # 
%cond.store15 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 8 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 512 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_14 +; RV64ZVE32F-NEXT: j .LBB91_15 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %idxs call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %val, <16 x i8*> %ptrs, i32 1, <16 x i1> %m) ret void @@ -1868,6 +11489,401 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, i8* %base, <32 x i8> %idxs, ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: ret +; +; RV64ZVE32F-LABEL: mscatter_baseidx_v32i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a1, v0 +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.store +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vse8.v v8, (a2) +; RV64ZVE32F-NEXT: .LBB92_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: 
vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_6: # %else4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_60 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_61 +; RV64ZVE32F-NEXT: .LBB92_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_10 +; RV64ZVE32F-NEXT: .LBB92_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 5 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_10: # %else10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_62 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_63 +; RV64ZVE32F-NEXT: .LBB92_12: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_64 +; RV64ZVE32F-NEXT: .LBB92_13: # %else16 +; RV64ZVE32F-NEXT: andi a2, a1, 512 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_15 +; RV64ZVE32F-NEXT: .LBB92_14: # %cond.store17 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 9 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; 
RV64ZVE32F-NEXT: .LBB92_15: # %else18 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: andi a2, a1, 1024 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_17 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.store19 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 10 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_17: # %else20 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: lui a2, 1 +; RV64ZVE32F-NEXT: addiw a3, a2, -2048 +; RV64ZVE32F-NEXT: and a3, a1, a3 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB92_19 +; RV64ZVE32F-NEXT: # %bb.18: # %cond.store21 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v13 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 11 +; RV64ZVE32F-NEXT: vse8.v v14, (a3) +; RV64ZVE32F-NEXT: .LBB92_19: # %else22 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, ta, mu +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_21 +; RV64ZVE32F-NEXT: # %bb.20: # %cond.store23 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 12 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_21: # %else24 +; RV64ZVE32F-NEXT: lui a2, 2 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_23 +; RV64ZVE32F-NEXT: # %bb.22: # %cond.store25 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; 
RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 13 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_23: # %else26 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 4 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_25 +; RV64ZVE32F-NEXT: # %bb.24: # %cond.store27 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 14 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_25: # %else28 +; RV64ZVE32F-NEXT: lui a2, 8 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_27 +; RV64ZVE32F-NEXT: # %bb.26: # %cond.store29 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 15 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_27: # %else30 +; RV64ZVE32F-NEXT: lui a2, 16 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_29 +; RV64ZVE32F-NEXT: # %bb.28: # %cond.store31 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 16 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_29: # %else32 +; RV64ZVE32F-NEXT: lui a2, 32 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_31 +; RV64ZVE32F-NEXT: # %bb.30: # %cond.store33 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 17 +; 
RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_31: # %else34 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 64 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_33 +; RV64ZVE32F-NEXT: # %bb.32: # %cond.store35 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 18 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_33: # %else36 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: lui a2, 128 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_35 +; RV64ZVE32F-NEXT: # %bb.34: # %cond.store37 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 19 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_35: # %else38 +; RV64ZVE32F-NEXT: lui a2, 256 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_37 +; RV64ZVE32F-NEXT: # %bb.36: # %cond.store39 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 20 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_37: # %else40 +; RV64ZVE32F-NEXT: lui a2, 512 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_39 +; RV64ZVE32F-NEXT: # %bb.38: # %cond.store41 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, 
ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 21 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: .LBB92_39: # %else42 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 1024 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_41 +; RV64ZVE32F-NEXT: # %bb.40: # %cond.store43 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 22 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_41: # %else44 +; RV64ZVE32F-NEXT: lui a2, 2048 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_43 +; RV64ZVE32F-NEXT: # %bb.42: # %cond.store45 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 23 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_43: # %else46 +; RV64ZVE32F-NEXT: lui a2, 4096 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_45 +; RV64ZVE32F-NEXT: # %bb.44: # %cond.store47 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 24 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_45: # %else48 +; RV64ZVE32F-NEXT: lui a2, 8192 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_47 +; RV64ZVE32F-NEXT: # %bb.46: # %cond.store49 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add 
a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 25 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_47: # %else50 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 16384 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_49 +; RV64ZVE32F-NEXT: # %bb.48: # %cond.store51 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 26 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_49: # %else52 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, mu +; RV64ZVE32F-NEXT: lui a2, 32768 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_51 +; RV64ZVE32F-NEXT: # %bb.50: # %cond.store53 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 27 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_51: # %else54 +; RV64ZVE32F-NEXT: lui a2, 65536 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_53 +; RV64ZVE32F-NEXT: # %bb.52: # %cond.store55 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 28 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_53: # %else56 +; RV64ZVE32F-NEXT: lui a2, 131072 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_55 +; RV64ZVE32F-NEXT: # %bb.54: # %cond.store57 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v11, 
v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 29 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_55: # %else58 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: lui a2, 262144 +; RV64ZVE32F-NEXT: and a2, a1, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_57 +; RV64ZVE32F-NEXT: # %bb.56: # %cond.store59 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 30 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB92_57: # %else60 +; RV64ZVE32F-NEXT: lui a2, 524288 +; RV64ZVE32F-NEXT: and a1, a1, a2 +; RV64ZVE32F-NEXT: beqz a1, .LBB92_59 +; RV64ZVE32F-NEXT: # %bb.58: # %cond.store61 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: add a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 31 +; RV64ZVE32F-NEXT: vse8.v v8, (a0) +; RV64ZVE32F-NEXT: .LBB92_59: # %else62 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB92_60: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_8 +; RV64ZVE32F-NEXT: .LBB92_61: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 4 +; RV64ZVE32F-NEXT: 
vse8.v v14, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_9 +; RV64ZVE32F-NEXT: j .LBB92_10 +; RV64ZVE32F-NEXT: .LBB92_62: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 6 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 128 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_12 +; RV64ZVE32F-NEXT: .LBB92_63: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 7 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_13 +; RV64ZVE32F-NEXT: .LBB92_64: # %cond.store15 +; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, mu +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, mu +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 8 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 512 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_14 +; RV64ZVE32F-NEXT: j .LBB92_15 %ptrs = getelementptr inbounds i8, i8* %base, <32 x i8> %idxs call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %val, <32 x i8*> %ptrs, i32 1, <32 x i1> %m) ret void