forked from OSchip/llvm-project
Remove the pmulld intrinsic and autoupdate it as a vector multiply.
Rewrite the pmulld patterns, and make sure that they fold loads of arguments into the instruction. llvm-svn: 99910
This commit is contained in:
parent
4be6a75884
commit
6ad8167714
|
@ -810,9 +810,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
|||
def int_x86_sse41_pmuldq : GCCBuiltin<"__builtin_ia32_pmuldq128">,
|
||||
Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_x86_sse41_pmulld : GCCBuiltin<"__builtin_ia32_pmulld128">,
|
||||
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
}
|
||||
|
||||
// Vector extract
|
||||
|
|
|
@ -597,7 +597,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
|
|||
{ X86::PMULHUWrr, X86::PMULHUWrm, 16 },
|
||||
{ X86::PMULHWrr, X86::PMULHWrm, 16 },
|
||||
{ X86::PMULLDrr, X86::PMULLDrm, 16 },
|
||||
{ X86::PMULLDrr_int, X86::PMULLDrm_int, 16 },
|
||||
{ X86::PMULLWrr, X86::PMULLWrm, 16 },
|
||||
{ X86::PMULUDQrr, X86::PMULUDQrm, 16 },
|
||||
{ X86::PORrr, X86::PORrm, 16 },
|
||||
|
|
|
@ -3448,8 +3448,28 @@ let Constraints = "$src1 = $dst" in {
|
|||
OpSize;
|
||||
}
|
||||
}
|
||||
defm PMULLD : SS41I_binop_patint<0x40, "pmulld", v4i32, mul,
|
||||
int_x86_sse41_pmulld, 1>;
|
||||
|
||||
/// SS48I_binop_rm - Simple SSE41 binary operator.
///
/// Instantiates two instruction forms for opcode `opc`:
///   rr - both sources are VR128 registers; the pattern selects
///        (OpVT (OpNode $src1, $src2)).
///   rm - the second source is an i128mem operand, so a load of that operand
///        is folded into the instruction.
/// The enclosing `let Constraints` ties $src1 to $dst (two-address form).
/// `Commutable` is applied to the rr form only, via isCommutable.
///
/// NOTE(review): the rm pattern hard-codes bc_v4i32 for the loaded operand
/// while the rr pattern uses OpVT, so the rm form presumably only matches when
/// OpVT is v4i32 - confirm before instantiating with another type.
|
||||
let Constraints = "$src1 = $dst" in {
|
||||
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||
ValueType OpVT, bit Commutable = 0> {
|
||||
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
|
||||
(ins VR128:$src1, VR128:$src2),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
[(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>,
|
||||
OpSize {
|
||||
let isCommutable = Commutable;
|
||||
}
|
||||
// Memory form: $src2 is read through memopv2i64 and bitcast with bc_v4i32.
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
|
||||
(ins VR128:$src1, i128mem:$src2),
|
||||
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
|
||||
[(set VR128:$dst, (OpNode VR128:$src1,
|
||||
(bc_v4i32 (memopv2i64 addr:$src2))))]>,
|
||||
OpSize;
|
||||
}
|
||||
}
|
||||
|
||||
defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, 1>;
|
||||
|
||||
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
|
||||
let Constraints = "$src1 = $dst" in {
|
||||
|
|
|
@ -225,7 +225,12 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
|
|||
// Calls to these intrinsics are transformed into ShuffleVector's.
|
||||
NewFn = 0;
|
||||
return true;
|
||||
} else if (Name.compare(5, 16, "x86.sse41.pmulld", 16) == 0) {
|
||||
// Calls to these intrinsics are transformed into vector multiplies.
|
||||
NewFn = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
break;
|
||||
}
|
||||
|
@ -355,6 +360,18 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
|
|||
|
||||
// Clean up the old call now that it has been completely upgraded.
|
||||
CI->eraseFromParent();
|
||||
} else if (F->getName() == "llvm.x86.sse41.pmulld") {
|
||||
// Upgrade this set of intrinsics into vector multiplies.
|
||||
Instruction *Mul = BinaryOperator::CreateMul(CI->getOperand(1),
|
||||
CI->getOperand(2),
|
||||
CI->getName(),
|
||||
CI);
|
||||
// Fix up all the uses with our new multiply.
|
||||
if (!CI->use_empty())
|
||||
CI->replaceAllUsesWith(Mul);
|
||||
|
||||
// Remove the old intrinsic call now that the multiply has replaced it.
|
||||
CI->eraseFromParent();
|
||||
} else {
|
||||
llvm_unreachable("Unknown function for CallInst upgrade.");
|
||||
}
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
; The autoupgrader must rewrite every use of the removed intrinsic. Match its
; real name (llvm.x86.sse41.pmulld) so that this check cannot pass vacuously.
; RUN: llvm-dis < %s.bc | not grep {@llvm\\.x86\\.sse41\\.pmulld}
|
||||
; RUN: llvm-dis < %s.bc | grep mul
|
Binary file not shown.
|
@ -1,6 +1,6 @@
|
|||
; RUN: llc < %s -march=x86 -mattr=sse41 -stack-alignment=16 > %t
|
||||
; RUN: grep pmul %t | count 12
|
||||
; RUN: grep mov %t | count 12
|
||||
; RUN: grep mov %t | count 11
|
||||
|
||||
define <4 x i32> @a(<4 x i32> %i) nounwind {
|
||||
%A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
; RUN: llc < %s -march=x86-64 -mattr=+sse41 -asm-verbose=0 | FileCheck %s
|
||||
|
||||
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
|
||||
; CHECK: test1:
|
||||
; CHECK-NEXT: pmulld
|
||||
%C = mul <4 x i32> %A, %B
|
||||
ret <4 x i32> %C
|
||||
}
|
||||
|
||||
define <4 x i32> @test1a(<4 x i32> %A, <4 x i32> *%Bp) nounwind {
|
||||
; CHECK: test1a:
|
||||
; CHECK-NEXT: pmulld
|
||||
%B = load <4 x i32>* %Bp
|
||||
%C = mul <4 x i32> %A, %B
|
||||
ret <4 x i32> %C
|
||||
}
|
Loading…
Reference in New Issue