[AArch64][GlobalISel] Make vector dup optimization look at last elt of ZeroVec
Fix an off-by-one error which made us not look at the last element of the
zero vector. This caused a miscompile in 188.ammp.

Differential Revision: https://reviews.llvm.org/D65168

llvm-svn: 366930
commit c19c30776a (parent c913d1f2d6)
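tryOptVectorDup recognizes a G_SHUFFLE_VECTOR whose mask (the ZeroVec G_BUILD_VECTOR
below) is all zeros, i.e. a splat, and selects a DUP for it. Operand 0 of the build
vector is its def, so the element scan starts at operand 1; the buggy upper bound of
getNumOperands() - 1 then stopped one element short. For a two-element mask (three
operands) the loop only ever compared operand 1 against itself, so any 2-element mask
was accepted as a splat. A minimal standalone sketch of the bounds bug (hypothetical
code, not the LLVM sources):

#include <cassert>
#include <vector>

// Buggy bound: "size() - 1" stops one operand short, so the last mask
// element is never compared against the first.
static bool isSplatMaskBuggy(const std::vector<int> &Operands) {
  for (unsigned i = 1, e = Operands.size() - 1; i < e; ++i)
    if (Operands[i] != Operands[1])
      return false;
  return true;
}

// Fixed bound: visits every source operand, including the last one.
static bool isSplatMaskFixed(const std::vector<int> &Operands) {
  for (unsigned i = 1, e = Operands.size(); i < e; ++i)
    if (Operands[i] != Operands[1])
      return false;
  return true;
}

int main() {
  // Mimics the not_all_zeros test below: slot 0 stands in for the def,
  // then the two mask elements (0, 1). Not a splat.
  std::vector<int> BuildVec = {/*def*/ -1, 0, 1};
  assert(isSplatMaskBuggy(BuildVec));  // old bound: element 2 never inspected
  assert(!isSplatMaskFixed(BuildVec)); // new bound: (0, 1) correctly rejected
  return 0;
}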
llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp:

@@ -3523,7 +3523,7 @@ bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
   int64_t Zero = 0;
   if (!mi_match(ZeroVec->getOperand(1).getReg(), MRI, m_ICst(Zero)) || Zero)
     return false;
-  for (unsigned i = 1, e = ZeroVec->getNumOperands() - 1; i < e; ++i) {
+  for (unsigned i = 1, e = ZeroVec->getNumOperands(); i < e; ++i) {
     if (ZeroVec->getOperand(i).getReg() != ZeroVec->getOperand(1).getReg())
       return false; // This wasn't an all zeros vector.
   }
llvm/test/CodeGen/AArch64/GlobalISel/opt-shuffle-splat.mir:

@@ -120,6 +120,14 @@ body: |

     ; This test is exactly the same as splat_2xf64, except it adds two copies.
     ; These copies shouldn't get in the way of matching the dup pattern.
+    ; CHECK-LABEL: name: splat_2xf64_copies
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
+    ; CHECK: [[DUPv2i64lane:%[0-9]+]]:fpr128 = DUPv2i64lane [[INSERT_SUBREG]], 0
+    ; CHECK: $q0 = COPY [[DUPv2i64lane]]
+    ; CHECK: RET_ReallyLR implicit $q0
     %0:fpr(s64) = COPY $d0
     %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
     %6:fpr(<2 x s64>) = COPY %2
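The splat_2xf64_copies test above inserts extra COPYs between the G_IMPLICIT_DEF and
the shuffle; the dup match still fires because GlobalISel walks through copies to the
underlying definition (in-tree, getDefIgnoringCopies from llvm/CodeGen/GlobalISel/Utils.h
does this). A toy sketch of that walk, with hypothetical types standing in for
MachineRegisterInfo:

#include <cstdio>
#include <map>
#include <string>

struct Instr {
  std::string Opcode; // e.g. "COPY", "G_IMPLICIT_DEF"
  int Src;            // source register when Opcode == "COPY", else unused
};

// Registers are plain ints; Defs maps a register to its defining instruction.
static const Instr *getDefIgnoringCopiesSketch(int Reg,
                                               const std::map<int, Instr> &Defs) {
  while (true) {
    auto It = Defs.find(Reg);
    if (It == Defs.end())
      return nullptr;     // no known definition
    if (It->second.Opcode != "COPY")
      return &It->second; // the real (non-copy) definition
    Reg = It->second.Src; // step through the COPY
  }
}

int main() {
  // %2:fpr(<2 x s64>) = G_IMPLICIT_DEF ; %6:fpr(<2 x s64>) = COPY %2
  std::map<int, Instr> Defs = {{2, {"G_IMPLICIT_DEF", -1}}, {6, {"COPY", 2}}};
  const Instr *Def = getDefIgnoringCopiesSketch(6, Defs);
  std::printf("%s\n", Def ? Def->Opcode.c_str() : "<none>"); // G_IMPLICIT_DEF
  return 0;
}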
@@ -130,3 +138,35 @@ body: |
     %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %7(<2 x s64>), %2, %5(<2 x s32>)
     $q0 = COPY %4(<2 x s64>)
     RET_ReallyLR implicit $q0
+...
+---
+name: not_all_zeros
+alignment: 2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $x0
+
+    ; Make sure that we don't do the optimization when it's not all zeroes.
+    ; CHECK-LABEL: name: not_all_zeros
+    ; CHECK: liveins: $x0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSvi64gpr:%[0-9]+]]:fpr128 = INSvi64gpr [[DEF]], 0, [[COPY]]
+    ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
+    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
+    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[INSvi64gpr]], %subreg.qsub0, [[DEF]], %subreg.qsub1
+    ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]]
+    ; CHECK: $q0 = COPY [[TBLv16i8Two]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:gpr(s64) = COPY $x0
+    %2:fpr(<2 x s64>) = G_IMPLICIT_DEF
+    %3:gpr(s32) = G_CONSTANT i32 0
+    %6:gpr(s32) = G_CONSTANT i32 1
+    %5:fpr(<2 x s32>) = G_BUILD_VECTOR %3(s32), %6(s32)
+    %1:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %2, %0(s64), %3(s32)
+    %4:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %2, %5(<2 x s32>)
+    $q0 = COPY %4(<2 x s64>)
+    RET_ReallyLR implicit $q0
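With the fixed bound, the (0, 1) mask built by %5 is rejected as a splat, and
not_all_zeros falls back to generic shuffle selection: a TBLv16i8Two table lookup
driven by a constant-pool byte mask (the ADRP/LDRQui of %const.0 in the CHECK lines).
TBL selects result bytes by index from the concatenated table registers, so each
64-bit element index expands to eight consecutive byte indices. A small sketch of
that expansion (hypothetical helper, not LLVM code):

#include <cstdio>
#include <vector>

// Expand element indices into the byte indices a TBL mask would hold.
// EltBytes is the element width in bytes (8 for s64).
static std::vector<unsigned> tblByteMask(const std::vector<unsigned> &EltMask,
                                         unsigned EltBytes) {
  std::vector<unsigned> Bytes;
  for (unsigned Elt : EltMask)
    for (unsigned b = 0; b < EltBytes; ++b)
      Bytes.push_back(Elt * EltBytes + b);
  return Bytes;
}

int main() {
  // The (0, 1) shuffle from not_all_zeros: an identity shuffle over 2 x s64,
  // which expands to byte indices 0..15.
  for (unsigned B : tblByteMask({0, 1}, 8))
    std::printf("%u ", B);
  std::printf("\n");
  return 0;
}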