diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 2fb2ed2104c7..95d2ad4a248e 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -565,6 +565,12 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     // BMI/BMI2 foldable instructions
     { X86::RORX32ri,        X86::RORX32mi,        0 },
     { X86::RORX64ri,        X86::RORX64mi,        0 },
+    { X86::SARX32rr,        X86::SARX32rm,        0 },
+    { X86::SARX64rr,        X86::SARX64rm,        0 },
+    { X86::SHRX32rr,        X86::SHRX32rm,        0 },
+    { X86::SHRX64rr,        X86::SHRX64rm,        0 },
+    { X86::SHLX32rr,        X86::SHLX32rm,        0 },
+    { X86::SHLX64rr,        X86::SHLX64rm,        0 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td
index fe7d0ecf8965..893488c159ea 100644
--- a/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -896,4 +896,59 @@ let Predicates = [HasBMI2] in {
             (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
   def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
             (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+  // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with a variable shift amount, but
+  // not with an immediate one; i.e. the following code is considered better
+  //
+  //  mov %edi, %esi
+  //  shl $imm, %esi
+  //  ... %edi, ...
+  //
+  // than
+  //
+  //  movb $imm, %sil
+  //  shlx %sil, %edi, %esi
+  //  ... %edi, ...
+  //
+  let AddedComplexity = 1 in {
+    def : Pat<(sra GR32:$src1, GR8:$src2),
+              (SARX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(sra GR64:$src1, GR8:$src2),
+              (SARX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(srl GR32:$src1, GR8:$src2),
+              (SHRX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(srl GR64:$src1, GR8:$src2),
+              (SHRX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(shl GR32:$src1, GR8:$src2),
+              (SHLX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(shl GR64:$src1, GR8:$src2),
+              (SHLX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+  }
+
+  // Patterns for SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+  //
+  //  mov (%ecx), %esi
+  //  shl $imm, %esi
+  //
+  // over
+  //
+  //  movb $imm, %al
+  //  shlx %al, (%ecx), %esi
+  //
+  // Since SARXrr/SHRXrr/SHLXrr are preferred for variable shifts, the peephole
+  // optimization will fold them into SARXrm/SHRXrm/SHLXrm when possible.
 }
diff --git a/llvm/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/llvm/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 51320dd6d0ab..2a20e7ad6f15 100644
--- a/llvm/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/llvm/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
 ; rdar://5571034
 
 ; This requires physreg joining, %vreg13 is live everywhere:
diff --git a/llvm/test/CodeGen/X86/shift-bmi2.ll b/llvm/test/CodeGen/X86/shift-bmi2.ll
new file mode 100644
index 000000000000..d1f321f17738
--- /dev/null
+++ b/llvm/test/CodeGen/X86/shift-bmi2.ll
@@ -0,0 +1,178 @@
+; RUN: llc -mtriple=i386-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI2 %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI264 %s
+
+define i32 @shl32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = shl i32 %x, %shamt
+; BMI2: shl32
+; BMI2: shlxl
+; BMI2: ret
+; BMI264: shl32
+; BMI264: shlxl
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i32 @shl32i(i32 %x) nounwind uwtable readnone {
+entry:
+  %shl = shl i32 %x, 5
+; BMI2: shl32i
+; BMI2-NOT: shlxl
+; BMI2: ret
+; BMI264: shl32i
+; BMI264-NOT: shlxl
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i32 @shl32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i32* %p
+  %shl = shl i32 %x, %shamt
+; BMI2: shl32p
+; BMI2: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: shl32p
+; BMI264: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i32 @shl32pi(i32* %p) nounwind uwtable readnone {
+entry:
+  %x = load i32* %p
+  %shl = shl i32 %x, 5
+; BMI2: shl32pi
+; BMI2-NOT: shlxl
+; BMI2: ret
+; BMI264: shl32pi
+; BMI264-NOT: shlxl
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i64 @shl64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = shl i64 %x, %shamt
+; BMI264: shl64
+; BMI264: shlxq
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i64 @shl64i(i64 %x) nounwind uwtable readnone {
+entry:
+  %shl = shl i64 %x, 7
+; BMI264: shl64i
+; BMI264-NOT: shlxq
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i64 @shl64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i64* %p
+  %shl = shl i64 %x, %shamt
+; BMI264: shl64p
+; BMI264: shlxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i64 @shl64pi(i64* %p) nounwind uwtable readnone {
+entry:
+  %x = load i64* %p
+  %shl = shl i64 %x, 7
+; BMI264: shl64pi
+; BMI264-NOT: shlxq
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i32 @lshr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = lshr i32 %x, %shamt
+; BMI2: lshr32
+; BMI2: shrxl
+; BMI2: ret
+; BMI264: lshr32
+; BMI264: shrxl
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i32 @lshr32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i32* %p
+  %shl = lshr i32 %x, %shamt
+; BMI2: lshr32p
+; BMI2: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: lshr32p
+; BMI264: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i64 @lshr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = lshr i64 %x, %shamt
+; BMI264: lshr64
+; BMI264: shrxq
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i64 @lshr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i64* %p
+  %shl = lshr i64 %x, %shamt
+; BMI264: lshr64p
+; BMI264: shrxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i32 @ashr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = ashr i32 %x, %shamt
+; BMI2: ashr32
+; BMI2: sarxl
+; BMI2: ret
+; BMI264: ashr32
+; BMI264: sarxl
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i32 @ashr32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i32* %p
+  %shl = ashr i32 %x, %shamt
+; BMI2: ashr32p
+; BMI2: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: ashr32p
+; BMI264: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i32 %shl
+}
+
+define i64 @ashr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %shl = ashr i64 %x, %shamt
+; BMI264: ashr64
+; BMI264: sarxq
+; BMI264: ret
+  ret i64 %shl
+}
+
+define i64 @ashr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+  %x = load i64* %p
+  %shl = ashr i64 %x, %shamt
+; BMI264: ashr64p
+; BMI264: sarxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+  ret i64 %shl
+}
diff --git a/llvm/test/CodeGen/X86/targetLoweringGeneric.ll b/llvm/test/CodeGen/X86/targetLoweringGeneric.ll
index ba5f8f83619f..a773e9daeff8 100644
--- a/llvm/test/CodeGen/X86/targetLoweringGeneric.ll
+++ b/llvm/test/CodeGen/X86/targetLoweringGeneric.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i386-apple-darwin9 -fast-isel=false -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=corei7 -fast-isel=false -O0 < %s | FileCheck %s
 ; Gather non-machine specific tests for the transformations in
 ; CodeGen/SelectionDAG/TargetLowering. Currently, these
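
To see the intended selection end to end, here is a small standalone IR sketch (not part of the patch; the file and function names are invented for illustration). Based on the CHECK lines in shift-bmi2.ll above, running llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 over it should produce shlxq for the variable shift, a plain shl for the immediate shift, and a load-folded shlxq for the variable shift of a loaded value (via the new OpTbl1 entries).

; demo-shift-bmi2.ll -- hypothetical example, not part of the patch.
; Assumed invocation:
;   llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 < demo-shift-bmi2.ll

; Variable amount: matched by the new (shl GR64, GR8) pattern, so this
; should select shlxq.
define i64 @demo_var(i64 %x, i64 %n) nounwind readnone {
entry:
  %r = shl i64 %x, %n
  ret i64 %r
}

; Immediate amount: deliberately not covered by the SHLX patterns, so this
; should stay as a classic shlq $9 on the input register.
define i64 @demo_imm(i64 %x) nounwind readnone {
entry:
  %r = shl i64 %x, 9
  ret i64 %r
}

; Variable amount applied to a load: SHLX64rr is selected first, then the
; peephole folds the load through the new OpTbl1 entry, giving something
; like shlxq %rsi, (%rdi), %rax.
define i64 @demo_load_var(i64* %p, i64 %n) nounwind readnone {
entry:
  %x = load i64* %p
  %r = shl i64 %x, %n
  ret i64 %r
}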