// Patch overview (two files are touched).
//
// 1. llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp, inside reduceVSXSwap():
//    - The SkipRCCopy lambda still walks through COPY_TO_REGCLASS machine
//      nodes by following getOperand(0). It now returns an empty SDValue() as
//      soon as a node on that chain has no uses, or when the node's first user
//      (use_begin()) is not reported as its only user by isOnlyUserOf().
//    - After the walk, the value reached is returned only when hasOneUse()
//      holds for it; otherwise an empty SDValue() is returned.
//    - The call sites consequently test `!VecOp`, `!LHS` and `!RHS` instead of
//      calling hasOneUse() themselves on the values returned by the lambda.
//
// 2. llvm/test/CodeGen/PowerPC/swap-reduction.ll:
//    - Adds the function @vecop_uses2. Its first loaded <4 x i32> value (%1)
//      is an operand of both the `mul` and the `extractelement`, so it has two
//      users. The CHECK lines for this function expect three xxswapd
//      instructions in the generated code.
//
// NOTE(review): presumably the test pins the case where the swap reduction
// must not fire because an input is shared -- confirm against the commit
// message, which is not part of this text.
// NOTE(review): the hunk headers declare 19 -> 22 and 5 -> 31 lines, yet the
// patch text below sits on two physical lines; the original line breaks appear
// to have been lost and need to be restored before the patch can be applied.
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 9ccdc95e9ba4..2eea380c49bd 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -6903,19 +6903,22 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) { // TODO: Can we put this a common method for DAG? auto SkipRCCopy = [](SDValue V) { while (V->isMachineOpcode() && - V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS) + V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS) { + // All values in the chain should have single use. + if (V->use_empty() || !V->use_begin()->isOnlyUserOf(V.getNode())) + return SDValue(); V = V->getOperand(0); - return V; + } + return V.hasOneUse() ? V : SDValue(); }; SDValue VecOp = SkipRCCopy(N->getOperand(0)); - if (!isLaneInsensitive(VecOp) || !VecOp.hasOneUse()) + if (!VecOp || !isLaneInsensitive(VecOp)) return; SDValue LHS = SkipRCCopy(VecOp.getOperand(0)), RHS = SkipRCCopy(VecOp.getOperand(1)); - if (!LHS.hasOneUse() || !RHS.hasOneUse() || !isVSXSwap(LHS) || - !isVSXSwap(RHS)) + if (!LHS || !RHS || !isVSXSwap(LHS) || !isVSXSwap(RHS)) return; // These swaps may still have chain-uses here, count on dead code elimination diff --git a/llvm/test/CodeGen/PowerPC/swap-reduction.ll b/llvm/test/CodeGen/PowerPC/swap-reduction.ll index eb1f5b728b03..a2e7176654f8 100644 --- a/llvm/test/CodeGen/PowerPC/swap-reduction.ll +++ b/llvm/test/CodeGen/PowerPC/swap-reduction.ll @@ -83,5 +83,31 @@ entry: ret i16 %2 } +define signext i32 @vecop_uses2([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) { +; CHECK-LABEL: vecop_uses2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvd2x 0, 0, 3 +; CHECK-NEXT: lxvd2x 1, 0, 4 +; CHECK-NEXT: xxswapd 34, 0 +; CHECK-NEXT: xxswapd 35, 1 +; CHECK-NEXT: xxsldwi 0, 34, 34, 3 +; CHECK-NEXT: vmuluwm 2, 3, 2 +; CHECK-NEXT: mffprwz 3, 0 +; CHECK-NEXT: xxswapd 0, 34 +; CHECK-NEXT: extsw 3, 3 +; CHECK-NEXT: stxvd2x 0, 0, 5 +; CHECK-NEXT: blr +entry: + %0 = 
bitcast [4 x i32]* %a to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast [4 x i32]* %b to <4 x i32>* + %3 = load <4 x i32>, <4 x i32>* %2, align 4 + %4 = mul <4 x i32> %3, %1 + %5 = bitcast [4 x i32]* %c to <4 x i32>* + store <4 x i32> %4, <4 x i32>* %5, align 4 + %6 = extractelement <4 x i32> %1, i32 3 + ret i32 %6 +} + declare <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8>, <16 x i8>) declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)