[X86] Add comments to the end of FMA3 instructions to make the operation clear

Summary:
There are 3 different operand orders for FMA instructions so figuring out the exact operation being performed requires a lot of thought.

This patch adds a comment to the end of the assembly line to print the exact operation.

I think I've got all the instructions in here except the ones with builtin rounding.

I didn't update all tests, but I assume we can get them as we regenerate tests in the future.

Reviewers: spatel, v_klochkov, RKSimon

Reviewed By: spatel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D44345

llvm-svn: 327225
This commit is contained in:
Craig Topper 2018-03-10 21:30:46 +00:00
parent 4ccea6230b
commit d88204fe1b
9 changed files with 3732 additions and 1585 deletions

View File

@ -160,6 +160,46 @@ using namespace llvm;
CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
#define CASE_AVX512_FMA(Inst, suf) \
CASE_AVX512_INS_COMMON(Inst, Z, suf) \
CASE_AVX512_INS_COMMON(Inst, Z256, suf) \
CASE_AVX512_INS_COMMON(Inst, Z128, suf)
#define CASE_FMA(Inst, suf) \
CASE_AVX512_FMA(Inst, suf) \
CASE_AVX_INS_COMMON(Inst, , suf) \
CASE_AVX_INS_COMMON(Inst, Y, suf)
#define CASE_FMA_PACKED_REG(Inst) \
CASE_FMA(Inst##PD, r) \
CASE_FMA(Inst##PS, r)
#define CASE_FMA_PACKED_MEM(Inst) \
CASE_FMA(Inst##PD, m) \
CASE_FMA(Inst##PS, m) \
CASE_AVX512_FMA(Inst##PD, mb) \
CASE_AVX512_FMA(Inst##PS, mb)
#define CASE_FMA_SCALAR_REG(Inst) \
CASE_AVX_INS_COMMON(Inst##SD, , r) \
CASE_AVX_INS_COMMON(Inst##SS, , r) \
CASE_AVX_INS_COMMON(Inst##SD, , r_Int) \
CASE_AVX_INS_COMMON(Inst##SS, , r_Int) \
CASE_AVX_INS_COMMON(Inst##SD, Z, r) \
CASE_AVX_INS_COMMON(Inst##SS, Z, r) \
CASE_AVX512_INS_COMMON(Inst##SD, Z, r_Int) \
CASE_AVX512_INS_COMMON(Inst##SS, Z, r_Int)
#define CASE_FMA_SCALAR_MEM(Inst) \
CASE_AVX_INS_COMMON(Inst##SD, , m) \
CASE_AVX_INS_COMMON(Inst##SS, , m) \
CASE_AVX_INS_COMMON(Inst##SD, , m_Int) \
CASE_AVX_INS_COMMON(Inst##SS, , m_Int) \
CASE_AVX_INS_COMMON(Inst##SD, Z, m) \
CASE_AVX_INS_COMMON(Inst##SS, Z, m) \
CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \
CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int)
static unsigned getVectorRegSize(unsigned RegNo) {
if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31)
return 512;
@ -232,6 +272,249 @@ static void printMasking(raw_ostream &OS, const MCInst *MI,
OS << " {z}";
}
static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS,
const char *(*getRegName)(unsigned)) {
const char *Mul1Name = nullptr, *Mul2Name = nullptr, *AccName = nullptr;
unsigned NumOperands = MI->getNumOperands();
bool RegForm = false;
bool Negate = false;
StringRef AccStr = "+";
// The operands for FMA instructions without rounding fall into two forms.
// dest, src1, src2, src3
// dest, src1, mask, src2, src3
// Where src3 is either a register or 5 memory address operands. So to find
// dest and src1 we can index from the front. To find src2 and src3 we can
// index from the end by taking into account memory vs register form when
// finding src2.
switch (MI->getOpcode()) {
default:
return false;
CASE_FMA_PACKED_REG(FMADD132)
CASE_FMA_SCALAR_REG(FMADD132)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMADD132)
CASE_FMA_SCALAR_MEM(FMADD132)
AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul1Name = getRegName(MI->getOperand(1).getReg());
break;
CASE_FMA_PACKED_REG(FMADD213)
CASE_FMA_SCALAR_REG(FMADD213)
AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMADD213)
CASE_FMA_SCALAR_MEM(FMADD213)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul2Name = getRegName(MI->getOperand(1).getReg());
break;
CASE_FMA_PACKED_REG(FMADD231)
CASE_FMA_SCALAR_REG(FMADD231)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMADD231)
CASE_FMA_SCALAR_MEM(FMADD231)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
AccName = getRegName(MI->getOperand(1).getReg());
break;
CASE_FMA_PACKED_REG(FMSUB132)
CASE_FMA_SCALAR_REG(FMSUB132)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMSUB132)
CASE_FMA_SCALAR_MEM(FMSUB132)
AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul1Name = getRegName(MI->getOperand(1).getReg());
AccStr = "-";
break;
CASE_FMA_PACKED_REG(FMSUB213)
CASE_FMA_SCALAR_REG(FMSUB213)
AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMSUB213)
CASE_FMA_SCALAR_MEM(FMSUB213)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul2Name = getRegName(MI->getOperand(1).getReg());
AccStr = "-";
break;
CASE_FMA_PACKED_REG(FMSUB231)
CASE_FMA_SCALAR_REG(FMSUB231)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMSUB231)
CASE_FMA_SCALAR_MEM(FMSUB231)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
AccName = getRegName(MI->getOperand(1).getReg());
AccStr = "-";
break;
CASE_FMA_PACKED_REG(FNMADD132)
CASE_FMA_SCALAR_REG(FNMADD132)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FNMADD132)
CASE_FMA_SCALAR_MEM(FNMADD132)
AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul1Name = getRegName(MI->getOperand(1).getReg());
Negate = true;
break;
CASE_FMA_PACKED_REG(FNMADD213)
CASE_FMA_SCALAR_REG(FNMADD213)
AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FNMADD213)
CASE_FMA_SCALAR_MEM(FNMADD213)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul2Name = getRegName(MI->getOperand(1).getReg());
Negate = true;
break;
CASE_FMA_PACKED_REG(FNMADD231)
CASE_FMA_SCALAR_REG(FNMADD231)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FNMADD231)
CASE_FMA_SCALAR_MEM(FNMADD231)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
AccName = getRegName(MI->getOperand(1).getReg());
Negate = true;
break;
CASE_FMA_PACKED_REG(FNMSUB132)
CASE_FMA_SCALAR_REG(FNMSUB132)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FNMSUB132)
CASE_FMA_SCALAR_MEM(FNMSUB132)
AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul1Name = getRegName(MI->getOperand(1).getReg());
AccStr = "-";
Negate = true;
break;
CASE_FMA_PACKED_REG(FNMSUB213)
CASE_FMA_SCALAR_REG(FNMSUB213)
AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FNMSUB213)
CASE_FMA_SCALAR_MEM(FNMSUB213)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul2Name = getRegName(MI->getOperand(1).getReg());
AccStr = "-";
Negate = true;
break;
CASE_FMA_PACKED_REG(FNMSUB231)
CASE_FMA_SCALAR_REG(FNMSUB231)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FNMSUB231)
CASE_FMA_SCALAR_MEM(FNMSUB231)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
AccName = getRegName(MI->getOperand(1).getReg());
AccStr = "-";
Negate = true;
break;
CASE_FMA_PACKED_REG(FMADDSUB132)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMADDSUB132)
AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul1Name = getRegName(MI->getOperand(1).getReg());
AccStr = "+/-";
break;
CASE_FMA_PACKED_REG(FMADDSUB213)
AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMADDSUB213)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul2Name = getRegName(MI->getOperand(1).getReg());
AccStr = "+/-";
break;
CASE_FMA_PACKED_REG(FMADDSUB231)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMADDSUB231)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
AccName = getRegName(MI->getOperand(1).getReg());
AccStr = "+/-";
break;
CASE_FMA_PACKED_REG(FMSUBADD132)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMSUBADD132)
AccName = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul1Name = getRegName(MI->getOperand(1).getReg());
AccStr = "-/+";
break;
CASE_FMA_PACKED_REG(FMSUBADD213)
AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMSUBADD213)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
Mul2Name = getRegName(MI->getOperand(1).getReg());
AccStr = "-/+";
break;
CASE_FMA_PACKED_REG(FMSUBADD231)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
LLVM_FALLTHROUGH;
CASE_FMA_PACKED_MEM(FMSUBADD231)
Mul1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
AccName = getRegName(MI->getOperand(1).getReg());
AccStr = "-/+";
break;
}
const char *DestName = getRegName(MI->getOperand(0).getReg());
if (!Mul1Name) Mul1Name = "mem";
if (!Mul2Name) Mul2Name = "mem";
if (!AccName) AccName = "mem";
OS << DestName << " = ";
// TODO: Print masking information?
if (Negate)
OS << '-';
OS << '(' << Mul1Name << " * " << Mul2Name << ") " << AccStr << ' '
<< AccName;
return true;
}
//===----------------------------------------------------------------------===//
// Top Level Entrypoint
//===----------------------------------------------------------------------===//
@ -248,6 +531,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
unsigned NumOperands = MI->getNumOperands();
bool RegForm = false;
if (printFMA3Comments(MI, OS, getRegName))
return true;
switch (MI->getOpcode()) {
default:
// Not an instruction for which we can decode comments.

View File

@ -3986,10 +3986,10 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd %xmm0, %xmm3
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3
; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm4
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
; CHECK-NEXT: vmovapd %xmm0, %xmm5
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
@ -4013,10 +4013,10 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm3
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3
; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm4
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
; CHECK-NEXT: vmovaps %xmm0, %xmm5
; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5
; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
@ -4041,7 +4041,7 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x d
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm3
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z}
; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
@ -4057,7 +4057,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x flo
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
@ -4070,10 +4070,10 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3
; CHECK-NEXT: vfmadd231sd {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfmadd231sd {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
; CHECK-NEXT: vmovapd %xmm2, %xmm5
; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5
; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
@ -4097,10 +4097,10 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3
; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm4
; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
; CHECK-NEXT: vmovaps %xmm2, %xmm5
; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5
; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
@ -4123,7 +4123,7 @@ define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) {
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1}
; CHECK-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
%a.val = load float, float* %a
@ -4150,7 +4150,7 @@ define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) {
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
%a.val = load float, float* %a
@ -4177,7 +4177,7 @@ define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) {
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1}
; CHECK-NEXT: vfmadd132sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
; CHECK-NEXT: retq
%a.val = load double, double* %a
@ -4200,7 +4200,7 @@ define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) {
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vfmadd132sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
; CHECK-NEXT: retq
%a.val = load double, double* %a
@ -4224,10 +4224,10 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3
; CHECK-NEXT: vfmsub231sd {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfmsub231sd {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
; CHECK-NEXT: vmovapd %xmm2, %xmm5
; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5
; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
@ -4251,10 +4251,10 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3
; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm4
; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
; CHECK-NEXT: vmovaps %xmm2, %xmm5
; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5
; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
@ -4278,10 +4278,10 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3
; CHECK-NEXT: vfnmsub231sd {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfnmsub231sd {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
; CHECK-NEXT: vmovapd %xmm2, %xmm5
; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5
; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
@ -4305,10 +4305,10 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3
; CHECK-NEXT: vfnmsub231ss {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm4
; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfnmsub231ss {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
; CHECK-NEXT: vmovaps %xmm2, %xmm5
; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5
; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
@ -4330,7 +4330,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%q = load float, float* %ptr_b
@ -4343,7 +4343,7 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x f
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
; CHECK-NEXT: retq
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@ -4357,7 +4357,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0

View File

@ -4092,6 +4092,7 @@ define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
; CHECK-NEXT: ## ymm0 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
@ -4104,6 +4105,7 @@ define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
; CHECK-NEXT: ## xmm0 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
@ -4116,6 +4118,7 @@ define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
; CHECK-NEXT: ## ymm0 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
ret <4 x double> %res
@ -4128,6 +4131,7 @@ define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
; CHECK-NEXT: ## xmm0 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
ret <2 x double> %res
@ -4139,7 +4143,9 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) + xmm2
; CHECK-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
; CHECK-NEXT: ## xmm0 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4156,7 +4162,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) + xmm2
; CHECK-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1]
; CHECK-NEXT: ## xmm2 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4173,7 +4181,9 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) + xmm2
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0xa8,0xca]
; CHECK-NEXT: ## xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4188,7 +4198,9 @@ define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) + ymm2
; CHECK-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
; CHECK-NEXT: ## ymm0 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4205,7 +4217,9 @@ define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) + ymm2
; CHECK-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1]
; CHECK-NEXT: ## ymm2 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4222,7 +4236,9 @@ define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) + ymm2
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0xa8,0xca]
; CHECK-NEXT: ## ymm1 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4237,7 +4253,9 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) + xmm2
; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
; CHECK-NEXT: ## xmm0 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4254,7 +4272,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) + xmm2
; CHECK-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd1]
; CHECK-NEXT: ## xmm2 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4271,7 +4291,9 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) + xmm2
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0xa8,0xca]
; CHECK-NEXT: ## xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4286,7 +4308,9 @@ define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) + ymm2
; CHECK-NEXT: vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
; CHECK-NEXT: ## ymm0 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -4303,7 +4327,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) + ymm2
; CHECK-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1]
; CHECK-NEXT: ## ymm2 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -4320,7 +4346,9 @@ define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) + ymm2
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0xa8,0xca]
; CHECK-NEXT: ## ymm1 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -4338,7 +4366,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaa,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) - xmm2
; CHECK-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1]
; CHECK-NEXT: ## xmm2 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4356,7 +4386,9 @@ define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xaa,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) - ymm2
; CHECK-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd1]
; CHECK-NEXT: ## ymm2 = (ymm0 * ymm1) - ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4373,7 +4405,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaa,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) - xmm2
; CHECK-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1]
; CHECK-NEXT: ## xmm2 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4390,7 +4424,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xaa,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) - ymm2
; CHECK-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1]
; CHECK-NEXT: ## ymm2 = (ymm0 * ymm1) - ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -4406,6 +4442,7 @@ define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
; CHECK-NEXT: ## ymm0 = -(ymm0 * ymm1) + ymm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
@ -4418,6 +4455,7 @@ define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
; CHECK-NEXT: ## xmm0 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
@ -4430,6 +4468,7 @@ define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1,
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
; CHECK-NEXT: ## ymm0 = -(ymm0 * ymm1) + ymm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
@ -4442,6 +4481,7 @@ define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1,
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
; CHECK-NEXT: ## xmm0 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
@ -4454,6 +4494,7 @@ define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
; CHECK-NEXT: ## ymm0 = -(ymm0 * ymm1) - ymm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
@ -4466,6 +4507,7 @@ define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
; CHECK-NEXT: ## xmm0 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
@ -4478,6 +4520,7 @@ define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1,
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
; CHECK-NEXT: ## ymm0 = -(ymm0 * ymm1) - ymm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
@ -4490,6 +4533,7 @@ define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1,
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
; CHECK-NEXT: ## xmm0 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
@ -4502,7 +4546,9 @@ define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xae,0xda]
; CHECK-NEXT: ## xmm3 = -(xmm0 * xmm3) - xmm2
; CHECK-NEXT: vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
; CHECK-NEXT: ## xmm0 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4519,7 +4565,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xae,0xda]
; CHECK-NEXT: ## xmm3 = -(xmm0 * xmm3) - xmm2
; CHECK-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1]
; CHECK-NEXT: ## xmm2 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4534,7 +4582,9 @@ define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xae,0xda]
; CHECK-NEXT: ## ymm3 = -(ymm0 * ymm3) - ymm2
; CHECK-NEXT: vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
; CHECK-NEXT: ## ymm0 = -(ymm0 * ymm1) - ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4551,7 +4601,9 @@ define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xae,0xda]
; CHECK-NEXT: ## ymm3 = -(ymm0 * ymm3) - ymm2
; CHECK-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1]
; CHECK-NEXT: ## ymm2 = -(ymm0 * ymm1) - ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4566,7 +4618,9 @@ define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xae,0xda]
; CHECK-NEXT: ## xmm3 = -(xmm0 * xmm3) - xmm2
; CHECK-NEXT: vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
; CHECK-NEXT: ## xmm0 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4583,7 +4637,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xae,0xda]
; CHECK-NEXT: ## xmm3 = -(xmm0 * xmm3) - xmm2
; CHECK-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1]
; CHECK-NEXT: ## xmm2 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4598,7 +4654,9 @@ define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xae,0xda]
; CHECK-NEXT: ## ymm3 = -(ymm0 * ymm3) - ymm2
; CHECK-NEXT: vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
; CHECK-NEXT: ## ymm0 = -(ymm0 * ymm1) - ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -4615,7 +4673,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xae,0xda]
; CHECK-NEXT: ## ymm3 = -(ymm0 * ymm3) - ymm2
; CHECK-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1]
; CHECK-NEXT: ## ymm2 = -(ymm0 * ymm1) - ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -4630,7 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xac,0xda]
; CHECK-NEXT: ## xmm3 = -(xmm0 * xmm3) + xmm2
; CHECK-NEXT: vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
; CHECK-NEXT: ## xmm0 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4645,7 +4707,9 @@ define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xac,0xda]
; CHECK-NEXT: ## ymm3 = -(ymm0 * ymm3) + ymm2
; CHECK-NEXT: vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
; CHECK-NEXT: ## ymm0 = -(ymm0 * ymm1) + ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4660,7 +4724,9 @@ define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xac,0xda]
; CHECK-NEXT: ## xmm3 = -(xmm0 * xmm3) + xmm2
; CHECK-NEXT: vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
; CHECK-NEXT: ## xmm0 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4675,7 +4741,9 @@ define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xac,0xda]
; CHECK-NEXT: ## ymm3 = -(ymm0 * ymm3) + ymm2
; CHECK-NEXT: vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
; CHECK-NEXT: ## ymm0 = -(ymm0 * ymm1) + ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -4691,6 +4759,7 @@ define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
; CHECK-NEXT: ## ymm0 = (ymm0 * ymm1) +/- ymm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
ret <8 x float> %res
@ -4703,6 +4772,7 @@ define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
; CHECK-NEXT: ## xmm0 = (xmm0 * xmm1) +/- xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
ret <4 x float> %res
@ -4715,6 +4785,7 @@ define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
; CHECK-NEXT: ## ymm0 = (ymm0 * ymm1) +/- ymm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
@ -4727,6 +4798,7 @@ define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
; CHECK-NEXT: ## xmm0 = (xmm0 * xmm1) +/- xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
@ -4738,7 +4810,9 @@ define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0,
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) +/- xmm2
; CHECK-NEXT: vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
; CHECK-NEXT: ## xmm0 = (xmm0 * xmm1) +/- xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4755,7 +4829,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0,
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) +/- xmm2
; CHECK-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1]
; CHECK-NEXT: ## xmm2 = (xmm0 * xmm1) +/- xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4772,7 +4848,9 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0,
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) +/- xmm2
; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0xa6,0xca]
; CHECK-NEXT: ## xmm1 = (xmm0 * xmm1) +/- xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4787,7 +4865,9 @@ define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0,
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) +/- ymm2
; CHECK-NEXT: vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
; CHECK-NEXT: ## ymm0 = (ymm0 * ymm1) +/- ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4804,7 +4884,9 @@ define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0,
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) +/- ymm2
; CHECK-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1]
; CHECK-NEXT: ## ymm2 = (ymm0 * ymm1) +/- ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4821,7 +4903,9 @@ define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0,
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) +/- ymm2
; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0xa6,0xca]
; CHECK-NEXT: ## ymm1 = (ymm0 * ymm1) +/- ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4836,7 +4920,9 @@ define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) +/- xmm2
; CHECK-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
; CHECK-NEXT: ## xmm0 = (xmm0 * xmm1) +/- xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4853,7 +4939,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) +/- xmm2
; CHECK-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1]
; CHECK-NEXT: ## xmm2 = (xmm0 * xmm1) +/- xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4870,7 +4958,9 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) +/- xmm2
; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0xa6,0xca]
; CHECK-NEXT: ## xmm1 = (xmm0 * xmm1) +/- xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4885,7 +4975,9 @@ define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) +/- ymm2
; CHECK-NEXT: vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
; CHECK-NEXT: ## ymm0 = (ymm0 * ymm1) +/- ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -4902,7 +4994,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) +/- ymm2
; CHECK-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1]
; CHECK-NEXT: ## ymm2 = (ymm0 * ymm1) +/- ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -4919,7 +5013,9 @@ define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) +/- ymm2
; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0xa6,0xca]
; CHECK-NEXT: ## ymm1 = (ymm0 * ymm1) +/- ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -4936,7 +5032,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0,
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa7,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) -/+ xmm2
; CHECK-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1]
; CHECK-NEXT: ## xmm2 = (xmm0 * xmm1) -/+ xmm2
; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
@ -4953,7 +5051,9 @@ define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0,
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa7,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) -/+ ymm2
; CHECK-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1]
; CHECK-NEXT: ## ymm2 = (ymm0 * ymm1) -/+ ymm2
; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
@ -4970,7 +5070,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa7,0xda]
; CHECK-NEXT: ## xmm3 = (xmm0 * xmm3) -/+ xmm2
; CHECK-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1]
; CHECK-NEXT: ## xmm2 = (xmm0 * xmm1) -/+ xmm2
; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
@ -4987,7 +5089,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa7,0xda]
; CHECK-NEXT: ## ymm3 = (ymm0 * ymm3) -/+ ymm2
; CHECK-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1]
; CHECK-NEXT: ## ymm2 = (ymm0 * ymm1) -/+ ymm2
; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
@ -5002,6 +5106,7 @@ define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
; CHECK-NEXT: ## xmm0 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
@ -5011,6 +5116,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1,
; CHECK-LABEL: test_mask_vfmadd128_ps_rz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
ret <4 x float> %res
@ -5021,6 +5127,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1,
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
@ -5032,6 +5139,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
@ -5042,6 +5150,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1
; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x07]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
@ -5052,6 +5161,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a
; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza:
; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x07]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
@ -5063,6 +5173,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1,
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@ -5078,6 +5189,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@ -5092,6 +5204,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1
; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@ -5106,6 +5219,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a
; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza:
; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@ -5121,6 +5235,7 @@ define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
; CHECK-NEXT: ## xmm0 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
@ -5130,6 +5245,7 @@ define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a
; CHECK-LABEL: test_mask_vfmadd128_pd_rz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
ret <2 x double> %res
@ -5140,6 +5256,7 @@ define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <2 x double>, <2 x double>* %ptr_a2
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
@ -5150,6 +5267,7 @@ define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double>
; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
; CHECK-NEXT: ## xmm0 = (xmm1 * xmm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <2 x double>, <2 x double>* %ptr_a2
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
@ -5161,6 +5279,7 @@ define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
; CHECK-NEXT: ## ymm0 = (ymm0 * ymm1) + ymm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
@ -5170,6 +5289,7 @@ define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a
; CHECK-LABEL: test_mask_vfmadd256_pd_rz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
; CHECK-NEXT: ## ymm0 = (ymm1 * ymm0) + ymm2
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
ret <4 x double> %res
@ -5180,6 +5300,7 @@ define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
; CHECK-NEXT: ## ymm0 = (ymm1 * ymm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x double>, <4 x double>* %ptr_a2
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
@ -5190,6 +5311,7 @@ define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double>
; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz:
; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
; CHECK-NEXT: ## ymm0 = (ymm1 * ymm0) + mem
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x double>, <4 x double>* %ptr_a2
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind

View File

@ -42,7 +42,7 @@ define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fmadd_baa_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %xmm0
; FMA-NEXT: vfmadd132ps (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
@ -52,7 +52,7 @@ define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fmadd_aba_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %xmm0
; FMA-NEXT: vfmadd231ps (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfmadd231ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
@ -62,7 +62,7 @@ define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fmadd_bba_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rdx), %xmm0
; FMA-NEXT: vfmadd213ps (%rcx), %xmm0, %xmm0
; FMA-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
@ -73,7 +73,7 @@ define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA-LABEL: test_x86_fmadd_baa_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %ymm0
; FMA-NEXT: vfmadd132ps (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
ret <8 x float> %res
@ -83,7 +83,7 @@ define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA-LABEL: test_x86_fmadd_aba_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %ymm0
; FMA-NEXT: vfmadd231ps (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfmadd231ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
@ -93,7 +93,7 @@ define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA-LABEL: test_x86_fmadd_bba_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rdx), %ymm0
; FMA-NEXT: vfmadd213ps (%rcx), %ymm0, %ymm0
; FMA-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm0 * ymm0) + mem
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
@ -136,7 +136,7 @@ define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fmadd_baa_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %xmm0
; FMA-NEXT: vfmadd132pd (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfmadd132pd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
@ -146,7 +146,7 @@ define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fmadd_aba_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %xmm0
; FMA-NEXT: vfmadd231pd (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfmadd231pd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
@ -156,7 +156,7 @@ define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fmadd_bba_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rdx), %xmm0
; FMA-NEXT: vfmadd213pd (%rcx), %xmm0, %xmm0
; FMA-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
@ -167,7 +167,7 @@ define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #
; FMA-LABEL: test_x86_fmadd_baa_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %ymm0
; FMA-NEXT: vfmadd132pd (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfmadd132pd {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
ret <4 x double> %res
@ -177,7 +177,7 @@ define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #
; FMA-LABEL: test_x86_fmadd_aba_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %ymm0
; FMA-NEXT: vfmadd231pd (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm0 * mem) + ymm0
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
@ -187,7 +187,7 @@ define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #
; FMA-LABEL: test_x86_fmadd_bba_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rdx), %ymm0
; FMA-NEXT: vfmadd213pd (%rcx), %ymm0, %ymm0
; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm0 * ymm0) + mem
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
@ -231,7 +231,7 @@ define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fnmadd_baa_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %xmm0
; FMA-NEXT: vfnmadd132ps (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
@ -241,7 +241,7 @@ define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fnmadd_aba_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %xmm0
; FMA-NEXT: vfnmadd231ps (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfnmadd231ps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
@ -251,7 +251,7 @@ define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fnmadd_bba_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rdx), %xmm0
; FMA-NEXT: vfnmadd213ps (%rcx), %xmm0, %xmm0
; FMA-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
@ -262,7 +262,7 @@ define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0
; FMA-LABEL: test_x86_fnmadd_baa_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %ymm0
; FMA-NEXT: vfnmadd132ps (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
ret <8 x float> %res
@ -272,7 +272,7 @@ define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0
; FMA-LABEL: test_x86_fnmadd_aba_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %ymm0
; FMA-NEXT: vfnmadd231ps (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfnmadd231ps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
@ -282,7 +282,7 @@ define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0
; FMA-LABEL: test_x86_fnmadd_bba_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rdx), %ymm0
; FMA-NEXT: vfnmadd213ps (%rcx), %ymm0, %ymm0
; FMA-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm0 * ymm0) + mem
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
@ -325,7 +325,7 @@ define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fnmadd_baa_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %xmm0
; FMA-NEXT: vfnmadd132pd (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
@ -335,7 +335,7 @@ define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fnmadd_aba_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %xmm0
; FMA-NEXT: vfnmadd231pd (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfnmadd231pd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
@ -345,7 +345,7 @@ define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fnmadd_bba_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rdx), %xmm0
; FMA-NEXT: vfnmadd213pd (%rcx), %xmm0, %xmm0
; FMA-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
@ -356,7 +356,7 @@ define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b)
; FMA-LABEL: test_x86_fnmadd_baa_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %ymm0
; FMA-NEXT: vfnmadd132pd (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
ret <4 x double> %res
@ -366,7 +366,7 @@ define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b)
; FMA-LABEL: test_x86_fnmadd_aba_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %ymm0
; FMA-NEXT: vfnmadd231pd (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfnmadd231pd {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
@ -376,7 +376,7 @@ define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b)
; FMA-LABEL: test_x86_fnmadd_bba_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rdx), %ymm0
; FMA-NEXT: vfnmadd213pd (%rcx), %ymm0, %ymm0
; FMA-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm0 * ymm0) + mem
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
@ -419,7 +419,7 @@ define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fmsub_baa_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %xmm0
; FMA-NEXT: vfmsub132ps (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfmsub132ps {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
@ -429,7 +429,7 @@ define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fmsub_aba_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %xmm0
; FMA-NEXT: vfmsub231ps (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
@ -439,7 +439,7 @@ define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fmsub_bba_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rdx), %xmm0
; FMA-NEXT: vfmsub213ps (%rcx), %xmm0, %xmm0
; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm0 * xmm0) - mem
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
@ -450,7 +450,7 @@ define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA-LABEL: test_x86_fmsub_baa_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %ymm0
; FMA-NEXT: vfmsub132ps (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfmsub132ps {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
ret <8 x float> %res
@ -460,7 +460,7 @@ define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA-LABEL: test_x86_fmsub_aba_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %ymm0
; FMA-NEXT: vfmsub231ps (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfmsub231ps {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
@ -470,7 +470,7 @@ define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
; FMA-LABEL: test_x86_fmsub_bba_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rdx), %ymm0
; FMA-NEXT: vfmsub213ps (%rcx), %ymm0, %ymm0
; FMA-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm0 * ymm0) - mem
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
@ -513,7 +513,7 @@ define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fmsub_baa_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %xmm0
; FMA-NEXT: vfmsub132pd (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
@ -523,7 +523,7 @@ define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fmsub_aba_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %xmm0
; FMA-NEXT: vfmsub231pd (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfmsub231pd {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
@ -533,7 +533,7 @@ define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fmsub_bba_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rdx), %xmm0
; FMA-NEXT: vfmsub213pd (%rcx), %xmm0, %xmm0
; FMA-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm0 * xmm0) - mem
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
@ -544,7 +544,7 @@ define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #
; FMA-LABEL: test_x86_fmsub_baa_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %ymm0
; FMA-NEXT: vfmsub132pd (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfmsub132pd {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
ret <4 x double> %res
@ -554,7 +554,7 @@ define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #
; FMA-LABEL: test_x86_fmsub_aba_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %ymm0
; FMA-NEXT: vfmsub231pd (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfmsub231pd {{.*#+}} ymm0 = (ymm0 * mem) - ymm0
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
@ -564,7 +564,7 @@ define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #
; FMA-LABEL: test_x86_fmsub_bba_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rdx), %ymm0
; FMA-NEXT: vfmsub213pd (%rcx), %ymm0, %ymm0
; FMA-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm0 * ymm0) - mem
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
@ -608,7 +608,7 @@ define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fnmsub_baa_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %xmm0
; FMA-NEXT: vfnmsub132ps (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
@ -618,7 +618,7 @@ define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fnmsub_aba_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %xmm0
; FMA-NEXT: vfnmsub231ps (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfnmsub231ps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
@ -628,7 +628,7 @@ define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
; FMA-LABEL: test_x86_fnmsub_bba_ps:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rdx), %xmm0
; FMA-NEXT: vfnmsub213ps (%rcx), %xmm0, %xmm0
; FMA-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem
; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
@ -639,7 +639,7 @@ define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0
; FMA-LABEL: test_x86_fnmsub_baa_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %ymm0
; FMA-NEXT: vfnmsub132ps (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
ret <8 x float> %res
@ -649,7 +649,7 @@ define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0
; FMA-LABEL: test_x86_fnmsub_aba_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rcx), %ymm0
; FMA-NEXT: vfnmsub231ps (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfnmsub231ps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
@ -659,7 +659,7 @@ define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0
; FMA-LABEL: test_x86_fnmsub_bba_ps_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovaps (%rdx), %ymm0
; FMA-NEXT: vfnmsub213ps (%rcx), %ymm0, %ymm0
; FMA-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm0 * ymm0) - mem
; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
@ -702,7 +702,7 @@ define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fnmsub_baa_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %xmm0
; FMA-NEXT: vfnmsub132pd (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
@ -712,7 +712,7 @@ define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fnmsub_aba_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %xmm0
; FMA-NEXT: vfnmsub231pd (%rdx), %xmm0, %xmm0
; FMA-NEXT: vfnmsub231pd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
@ -722,7 +722,7 @@ define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0
; FMA-LABEL: test_x86_fnmsub_bba_pd:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rdx), %xmm0
; FMA-NEXT: vfnmsub213pd (%rcx), %xmm0, %xmm0
; FMA-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem
; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
@ -733,7 +733,7 @@ define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b)
; FMA-LABEL: test_x86_fnmsub_baa_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %ymm0
; FMA-NEXT: vfnmsub132pd (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
ret <4 x double> %res
@ -743,7 +743,7 @@ define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b)
; FMA-LABEL: test_x86_fnmsub_aba_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rcx), %ymm0
; FMA-NEXT: vfnmsub231pd (%rdx), %ymm0, %ymm0
; FMA-NEXT: vfnmsub231pd {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
@ -753,7 +753,7 @@ define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b)
; FMA-LABEL: test_x86_fnmsub_bba_pd_y:
; FMA: # %bb.0:
; FMA-NEXT: vmovapd (%rdx), %ymm0
; FMA-NEXT: vfnmsub213pd (%rcx), %ymm0, %ymm0
; FMA-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm0 * ymm0) - mem
; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res

View File

@ -8,11 +8,13 @@ define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss:
@ -20,6 +22,7 @@ define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
@ -29,12 +32,14 @@ define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xa9,0xca]
; CHECK-FMA-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca]
; CHECK-AVX512VL-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
@ -43,6 +48,7 @@ define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1,
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
@ -53,11 +59,13 @@ define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd:
@ -65,6 +73,7 @@ define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
@ -74,12 +83,14 @@ define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
; CHECK-FMA-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
; CHECK-AVX512VL-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
@ -88,6 +99,7 @@ define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
@ -98,11 +110,13 @@ define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps:
@ -110,6 +124,7 @@ define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
@ -120,11 +135,13 @@ define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd:
@ -132,6 +149,7 @@ define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
@ -142,11 +160,13 @@ define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) + ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) + ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256:
@ -154,6 +174,7 @@ define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1,
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
@ -164,11 +185,13 @@ define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) + ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) + ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd_256:
@ -176,6 +199,7 @@ define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
@ -187,11 +211,13 @@ define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss:
@ -199,6 +225,7 @@ define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
@ -208,12 +235,14 @@ define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xab,0xca]
; CHECK-FMA-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca]
; CHECK-AVX512VL-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
@ -222,6 +251,7 @@ define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1,
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
@ -232,11 +262,13 @@ define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd:
@ -244,6 +276,7 @@ define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
@ -253,12 +286,14 @@ define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xab,0xca]
; CHECK-FMA-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca]
; CHECK-AVX512VL-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
@ -267,6 +302,7 @@ define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
@ -277,11 +313,13 @@ define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps:
@ -289,6 +327,7 @@ define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
@ -299,11 +338,13 @@ define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd:
@ -311,6 +352,7 @@ define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
@ -321,11 +363,13 @@ define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) - ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) - ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256:
@ -333,6 +377,7 @@ define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1,
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
@ -343,11 +388,13 @@ define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) - ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) - ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd_256:
@ -355,6 +402,7 @@ define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
@ -366,11 +414,13 @@ define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2]
; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss:
@ -378,6 +428,7 @@ define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
@ -387,12 +438,14 @@ define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xad,0xca]
; CHECK-FMA-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca]
; CHECK-AVX512VL-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
@ -401,6 +454,7 @@ define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
@ -411,11 +465,13 @@ define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd:
@ -423,6 +479,7 @@ define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
@ -432,12 +489,14 @@ define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double>
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xad,0xca]
; CHECK-FMA-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca]
; CHECK-AVX512VL-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
@ -446,6 +505,7 @@ define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double>
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
@ -456,11 +516,13 @@ define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2]
; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps:
@ -468,6 +530,7 @@ define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
@ -478,11 +541,13 @@ define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd:
@ -490,6 +555,7 @@ define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
@ -500,11 +566,13 @@ define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2]
; CHECK-FMA-NEXT: # ymm0 = -(ymm1 * ymm0) + ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = -(ymm1 * ymm0) + ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256:
@ -512,6 +580,7 @@ define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
@ -522,11 +591,13 @@ define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double>
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
; CHECK-FMA-NEXT: # ymm0 = -(ymm1 * ymm0) + ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = -(ymm1 * ymm0) + ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd_256:
@ -534,6 +605,7 @@ define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double>
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) + mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
@ -545,11 +617,13 @@ define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss:
@ -557,6 +631,7 @@ define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
@ -566,12 +641,14 @@ define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xaf,0xca]
; CHECK-FMA-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2
; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca]
; CHECK-AVX512VL-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2
; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
@ -580,6 +657,7 @@ define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
@ -590,11 +668,13 @@ define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd:
@ -602,6 +682,7 @@ define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
@ -611,12 +692,14 @@ define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double>
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
; CHECK-FMA-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2
; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
; CHECK-AVX512VL-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2
; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
@ -625,6 +708,7 @@ define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double>
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
@ -635,11 +719,13 @@ define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2]
; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps:
@ -647,6 +733,7 @@ define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
@ -657,11 +744,13 @@ define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
; CHECK-FMA-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd:
@ -669,6 +758,7 @@ define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1,
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
@ -679,11 +769,13 @@ define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2]
; CHECK-FMA-NEXT: # ymm0 = -(ymm1 * ymm0) - ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xae,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = -(ymm1 * ymm0) - ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256:
@ -691,6 +783,7 @@ define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
@ -701,11 +794,13 @@ define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double>
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
; CHECK-FMA-NEXT: # ymm0 = -(ymm1 * ymm0) - ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = -(ymm1 * ymm0) - ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd_256:
@ -713,6 +808,7 @@ define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double>
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) - mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
@ -724,11 +820,13 @@ define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) +/- xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) +/- xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps:
@ -736,6 +834,7 @@ define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1,
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) +/- mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
@ -746,11 +845,13 @@ define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) +/- xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) +/- xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd:
@ -758,6 +859,7 @@ define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) +/- mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
@ -768,11 +870,13 @@ define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) +/- ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) +/- ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256:
@ -780,6 +884,7 @@ define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) +/- mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
@ -790,11 +895,13 @@ define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) +/- ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) +/- ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd_256:
@ -802,6 +909,7 @@ define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) +/- mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
@ -813,11 +921,13 @@ define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1,
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) -/+ xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) -/+ xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps:
@ -825,6 +935,7 @@ define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1,
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) -/+ mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
@ -835,11 +946,13 @@ define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
; CHECK-FMA-NEXT: # xmm0 = (xmm1 * xmm0) -/+ xmm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
; CHECK-AVX512VL-NEXT: # xmm0 = (xmm1 * xmm0) -/+ xmm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd:
@ -847,6 +960,7 @@ define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00]
; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) -/+ mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
@ -857,11 +971,13 @@ define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) -/+ ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) -/+ ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256:
@ -869,6 +985,7 @@ define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %
; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) -/+ mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
@ -879,11 +996,13 @@ define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
; CHECK-FMA-NEXT: # ymm0 = (ymm1 * ymm0) -/+ ymm2
; CHECK-FMA-NEXT: retq # encoding: [0xc3]
;
; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
; CHECK-AVX512VL-NEXT: # ymm0 = (ymm1 * ymm0) -/+ ymm2
; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd_256:
@ -891,6 +1010,7 @@ define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double
; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00]
; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) -/+ mem
; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -101,8 +101,8 @@ define float @f32_one_step(float %x) #1 {
; FMA-RECIP-LABEL: f32_one_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_one_step:
@ -128,8 +128,8 @@ define float @f32_one_step(float %x) #1 {
; HASWELL-LABEL: f32_one_step:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step:
@ -145,15 +145,15 @@ define float @f32_one_step(float %x) #1 {
; KNL-LABEL: f32_one_step:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_one_step:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
@ -196,10 +196,10 @@ define float @f32_two_step(float %x) #2 {
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
; FMA-RECIP-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_two_step:
@ -235,10 +235,10 @@ define float @f32_two_step(float %x) #2 {
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step:
@ -260,10 +260,10 @@ define float @f32_two_step(float %x) #2 {
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_two_step:
@ -271,10 +271,10 @@ define float @f32_two_step(float %x) #2 {
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.33]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
@ -364,8 +364,8 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; FMA-RECIP-LABEL: v4f32_one_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step:
@ -392,8 +392,8 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step:
@ -410,15 +410,15 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
@ -461,10 +461,10 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
; FMA-RECIP-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_two_step:
@ -500,10 +500,10 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step:
@ -525,10 +525,10 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_two_step:
@ -536,10 +536,10 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
@ -639,8 +639,8 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; FMA-RECIP-LABEL: v8f32_one_step:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v8f32_one_step:
@ -667,8 +667,8 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step:
@ -685,15 +685,15 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_one_step:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
@ -749,10 +749,10 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
; FMA-RECIP-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v8f32_two_step:
@ -788,10 +788,10 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step:
@ -813,10 +813,10 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_two_step:
@ -824,10 +824,10 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:0.33]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div

View File

@ -96,8 +96,8 @@ define float @f32_one_step_2(float %x) #1 {
; FMA-RECIP-LABEL: f32_one_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
@ -126,8 +126,8 @@ define float @f32_one_step_2(float %x) #1 {
; HASWELL-LABEL: f32_one_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
@ -145,16 +145,16 @@ define float @f32_one_step_2(float %x) #1 {
; KNL-LABEL: f32_one_step_2:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_one_step_2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.33]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 3456.0, %x
@ -190,8 +190,8 @@ define float @f32_one_step_2_divs(float %x) #1 {
; FMA-RECIP-LABEL: f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1
; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
@ -223,8 +223,8 @@ define float @f32_one_step_2_divs(float %x) #1 {
; HASWELL-LABEL: f32_one_step_2_divs:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
@ -244,8 +244,8 @@ define float @f32_one_step_2_divs(float %x) #1 {
; KNL-LABEL: f32_one_step_2_divs:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; KNL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
@ -253,8 +253,8 @@ define float @f32_one_step_2_divs(float %x) #1 {
; SKX-LABEL: f32_one_step_2_divs:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.33]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
; SKX-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@ -302,10 +302,10 @@ define float @f32_two_step_2(float %x) #2 {
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
; FMA-RECIP-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
@ -344,10 +344,10 @@ define float @f32_two_step_2(float %x) #2 {
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
@ -371,10 +371,10 @@ define float @f32_two_step_2(float %x) #2 {
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
@ -383,10 +383,10 @@ define float @f32_two_step_2(float %x) #2 {
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.33]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.33]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 6789.0, %x
@ -420,8 +420,8 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; FMA-RECIP-LABEL: v4f32_one_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
@ -451,8 +451,8 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
@ -471,16 +471,16 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
@ -516,8 +516,8 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
@ -550,8 +550,8 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
@ -572,8 +572,8 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
@ -581,8 +581,8 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; SKX-LABEL: v4f32_one_step_2_divs:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@ -630,10 +630,10 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
; FMA-RECIP-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
@ -672,10 +672,10 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
@ -699,10 +699,10 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
@ -711,10 +711,10 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
@ -756,8 +756,8 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; FMA-RECIP-LABEL: v8f32_one_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; FMA-RECIP-NEXT: retq
;
@ -787,8 +787,8 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
@ -807,16 +807,16 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_one_step2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
@ -861,8 +861,8 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; FMA-RECIP-LABEL: v8f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1
; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
; FMA-RECIP-NEXT: retq
@ -895,8 +895,8 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
@ -917,8 +917,8 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
@ -926,8 +926,8 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; SKX-LABEL: v8f32_one_step_2_divs:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@ -989,10 +989,10 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
; FMA-RECIP-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
; FMA-RECIP-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; FMA-RECIP-NEXT: retq
;
@ -1031,10 +1031,10 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
@ -1058,10 +1058,10 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
@ -1070,10 +1070,10 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:0.33]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.33]
; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x